add smoke test training

make sure to set alternate optimizer and set lr and eps from adam
add soap optimizer support
2024-10-30 15:40:27 -04:00 · 2024-10-30 15:33:37 -04:00 · 2024-10-30 15:33:37 -04:00
97 changed files with 3301 additions and 2387 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -27,7 +27,7 @@ jobs:
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
-            python_version: "3.10"
+            python_version: "3.11"
            pytorch: 2.4.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
@@ -40,25 +40,23 @@ jobs:
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Docker metadata
        id: metadata
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v3
        with:
-          images: |
-            winglian/axolotl-base
-            axolotlai/axolotl-base
+          images: winglian/axolotl-base
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v4
        with:
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -17,7 +17,7 @@ jobs:
        - name: Set up Quarto
          uses: quarto-dev/quarto-actions/setup@v2
        - name: Setup Python
-          uses: actions/setup-python@v5
+          uses: actions/setup-python@v3
          with:
            python-version: '3.10'
        - name: install dependencies
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -15,9 +15,9 @@ jobs:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,13 +4,11 @@ on:
  push:
    branches:
      - "main"
-    tags:
-      - "v*"
  workflow_dispatch:

 jobs:
  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -34,7 +32,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -44,12 +42,7 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            winglian/axolotl
-            axolotlai/axolotl
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{version}}
+          images: winglian/axolotl
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
@@ -63,7 +56,7 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
@@ -77,7 +70,7 @@ jobs:

  build-axolotl-cloud:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -101,7 +94,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -111,25 +104,20 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            winglian/axolotl-cloud
-            axolotlai/axolotl-cloud
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{version}}
+          images: winglian/axolotl-cloud
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
@@ -140,7 +128,7 @@ jobs:

  build-axolotl-cloud-no-tmux:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -158,25 +146,20 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            winglian/axolotl-cloud-term
-            axolotlai/axolotl-cloud-term
-          tags: |
-            type=ref,event=branch
-            type=semver,pattern={{version}}
+          images: winglian/axolotl-cloud-term
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud-no-tmux
          push: ${{ github.event_name != 'pull_request' }}
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,14 +8,9 @@ on:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday

-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
 jobs:
  test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -36,7 +31,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -7,7 +7,7 @@ on:

 jobs:
  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -31,7 +31,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -41,9 +41,7 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            winglian/axolotl
-            axolotlai/axolotl
+          images: winglian/axolotl
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Set up Docker Buildx
@@ -71,7 +69,7 @@ jobs:

  build-axolotl-cloud:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -95,7 +93,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -105,9 +103,7 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            winglian/axolotl-cloud
-            axolotlai/axolotl-cloud
+          images: winglian/axolotl-cloud
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Login to Docker Hub
@@ -116,7 +112,7 @@ jobs:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v5
        with:
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -3,33 +3,12 @@ name: publish pypi
 on:
  push:
    tags:
-      - 'v*'
-  workflow_dispatch:
+      - '*'

 jobs:
-  setup_release:
-    name: Create Release
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Get the tag version
-        id: extract_branch
-        run: echo ::set-output name=branch::${GITHUB_REF#refs/tags/}
-        shell: bash
-
-      - name: Create Release
-        id: create_release
-        uses: actions/create-release@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        with:
-          tag_name: ${{ steps.extract_branch.outputs.branch }}
-          release_name: ${{ steps.extract_branch.outputs.branch }}
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup_release]
    environment:
      name: pypi
      url: https://pypi.org/p/axolotl
@@ -37,10 +16,10 @@ jobs:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: "3.10"

@@ -58,9 +37,9 @@ jobs:
        run: |
          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py

-      - name: Build a source dist
+      - name: Build a binary wheel
        run: |
-          python setup.py sdist
+          python setup.py sdist bdist_wheel

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -9,12 +9,12 @@ jobs:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0
        env:
          SKIP: no-commit-to-branch

@@ -25,15 +25,15 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
@@ -48,7 +48,6 @@ jobs:
          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

      - name: Install dependencies
        run: |
@@ -83,6 +82,13 @@ jobs:
            num_gpus: 1
            axolotl_extras: mamba-ssm
            nightly_build: "true"
+          - cuda: 121
+            cuda_version: 12.1.1
+            python_version: "3.11"
+            pytorch: 2.3.1
+            num_gpus: 1
+            axolotl_extras: mamba-ssm
+            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -93,7 +99,7 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.5.0
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,22 +15,17 @@ on:
       - '.github/workflows/*.yml'
  workflow_dispatch:

-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
 jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0
        env:
          SKIP: no-commit-to-branch

@@ -41,15 +36,15 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
@@ -71,58 +66,18 @@ jobs:

      - name: Run tests
        run: |
-          pytest -n8 --ignore=tests/e2e/ tests/
+          pytest --ignore=tests/e2e/ tests/

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-  docker-e2e-tests-1st:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.4.1
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.63.64 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.tests
-
  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 90
-    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+    needs: [pre-commit, pytest]

    strategy:
      fail-fast: false
@@ -134,10 +89,22 @@ jobs:
            pytorch: 2.3.1
            num_gpus: 1
            axolotl_extras: mamba-ssm
+          - cuda: 121
+            cuda_version: 12.1.1
+            python_version: "3.11"
+            pytorch: 2.3.1
+            num_gpus: 1
+            axolotl_extras: mamba-ssm
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.0
            num_gpus: 1
            axolotl_extras:
    steps:
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +0,0 @@
-include requirements.txt
-include README.md
-include LICENSE
-recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -1,21 +1,8 @@
-<p align="center">
-    <picture>
-        <source media="(prefers-color-scheme: dark)" srcset="image/axolotl_logo_digital_white.svg">
-        <source media="(prefers-color-scheme: light)" srcset="image/axolotl_logo_digital_black.svg">
-        <img alt="Axolotl" src="image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
-    </picture>
-</p>
+# Axolotl

-<p align="center">
-    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
-    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
-</p>
-<p align="center">
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
-</p>
+![tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg)
+![tests-nightly](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg)
+![multigpu-semi-weekly tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg)

 Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.

@@ -88,7 +75,7 @@ Features:
 <td>

 <div align="center">
-  <img src="image/axolotl_symbol_digital_white.svg" alt="axolotl" width="160">
+  <img src="image/axolotl.png" alt="axolotl" width="160">
  <div>
    <p>
      <b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
@@ -172,7 +159,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
 #### Docker

  ```bash
-  docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+  docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
  ```

  Or run on the current files for development:
@@ -191,7 +178,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
  A more powerful Docker command to run would be this:

  ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
  ```

  It additionally:
@@ -223,7 +210,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --

 #### Cloud GPU

-For cloud GPU providers that support docker images, use [`axolotlai/axolotl-cloud:main-latest`](https://hub.docker.com/r/axolotlai/axolotl-cloud/tags)
+For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)

 - on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
 - on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
@@ -332,7 +319,7 @@ Write a job description in YAML as below:
 # dstack.yaml
 type: task

-image: axolotlai/axolotl-cloud:main-latest
+image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2

 env:
  - HUGGING_FACE_HUB_TOKEN
@@ -396,10 +383,11 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
        - typescript
      type: ... # unimplemented custom format

-      # chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template
+      # fastchat conversation (deprecation soon, use chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template)
+      # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
    - path: ...
-      type: chat_template
-      chat_template: chatml # defaults to tokenizer's chat_template
+      type: sharegpt
+      conversation: chatml # default: vicuna_v1.1

      # local
    - path: data.jsonl # or json
@@ -574,8 +562,7 @@ plugins:
  - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
+liger_swiglu: true
 liger_fused_linear_cross_entropy: true
 ```

--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,4 +1,4 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+FROM winglian/axolotl-base:{{ BASE_TAG }}

 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
@@ -28,7 +28,6 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -e

-pytest -n8 --ignore=tests/e2e/ /workspace/axolotl/tests/
+pytest -n4 --ignore=tests/e2e/ /workspace/axolotl/tests/
 pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
 pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import App, Image
+from modal import Image, Stub

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -46,7 +46,7 @@ cicd_image = (
    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-app = App("Axolotl CI/CD", secrets=[])
+stub = Stub("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 2))
@@ -61,7 +61,7 @@ def run_cmd(cmd: str, run_folder: str):
        exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@app.function(
+@stub.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=60 * 60,
@@ -72,6 +72,6 @@ def cicd_pytest():
    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")


-@app.local_entrypoint()
+@stub.local_entrypoint()
 def main():
    cicd_pytest.remote()
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e

 # only run one test at a time so as not to OOM the GPU
-pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
+pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import App, Image
+from modal import Image, Stub

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -47,7 +47,7 @@ cicd_image = (
    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-app = App("Axolotl CI/CD", secrets=[])
+stub = Stub("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 1))
@@ -62,7 +62,7 @@ def run_cmd(cmd: str, run_folder: str):
        exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@app.function(
+@stub.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=60 * 60,
@@ -73,6 +73,6 @@ def cicd_pytest():
    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")


-@app.local_entrypoint()
+@stub.local_entrypoint()
 def main():
    cicd_pytest.remote()
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
@@ -1,4 +1,4 @@
-# Example config for debugging the chat_template prompt format
+# Example config for debugging the sharegpt prompt format
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
+FROM winglian/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
 ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
--- a/docker/Dockerfile-cloud-no-tmux
+++ b/docker/Dockerfile-cloud-no-tmux
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
+FROM winglian/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
 ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -83,7 +83,7 @@ lora_on_cpu: true
 datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
-    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
+    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
    data_files: # Optional[str] path to source data files
@@ -91,7 +91,15 @@ datasets:
    name: # Optional[str] name of dataset configuration to load
    train_on_split: train # Optional[str] name of dataset split to load from
    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
-    trust_remote_code: # Optional[bool] Trust remote code for untrusted source
+
+    # Optional[str] fastchat conversation type, only used with type: sharegpt
+    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+    # Add additional keys from your dataset as input or output roles
+    roles:
+      input: # Optional[List[str]]. These will be masked based on train_on_input
+      output: # Optional[List[str]].

  # Custom user instruction prompt
  - path: repo
@@ -175,8 +183,6 @@ test_datasets:

 # use RL training: 'dpo', 'ipo', 'kto'
 rl:
-# whether to perform weighting if doing DPO training. Boolean.
-dpo_use_weighting:

 # The name of the chat template to use for training, following values are supported:
 # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
@@ -406,7 +412,6 @@ lr_div_factor: # Learning rate div factor
 # - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_apex_fused
-# - adopt_adamw (only for torch version >= 2.5.1)
 # - adafactor
 # - adamw_anyprecision
 # - sgd
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -6,8 +6,33 @@ order: 3

 ## sharegpt

-IMPORTANT: ShareGPT is deprecated!. Please see `chat_template` section below.
+UPDATE: ShareGPT is being deprecated in the next release. Please see `chat_template` section below.

+conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+Note: `type: sharegpt` opens special configs:
+- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
+- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
+- `field_human`: specify the key to use instead of `human` in the conversation.
+- `field_model`: specify the key to use instead of `gpt` in the conversation.
+
+```yaml
+datasets:
+    path: ...
+    type: sharegpt
+
+    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+    # Add additional keys from your dataset as input or output roles
+    roles:
+      input: # Optional[List[str]]. These will be masked based on train_on_input
+      output: # Optional[List[str]].
+```

 ## pygmalion

@@ -15,6 +40,38 @@ IMPORTANT: ShareGPT is deprecated!. Please see `chat_template` section below.
 {"conversations": [{"role": "...", "value": "..."}]}
 ```

+## sharegpt.load_role
+
+conversations where `role` is used instead of `from`
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
+## sharegpt.load_guanaco
+
+conversations where `from` is `prompter` `assistant` instead of default sharegpt
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+## sharegpt.load_ultrachat
+
+conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
+
+```{.json filename="data.jsonl"}
+{"messages": [{"user": "...", "assistant": "..."}]}
+```
+
+## sharegpt_jokes
+
+creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+```
+

 ## chat_template

--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -185,7 +185,7 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3

 ## Debugging With Docker

-Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl.  Attaching VSCode to Docker takes a few more steps.
+Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl.  Attaching VSCode to Docker takes a few more steps.

 ### Setup

@@ -202,11 +202,11 @@ cd axolotl
 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
 ```

 >[!Tip]
-> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
+> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/winglian/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

 You will now be in the container.  Next, perform an editable install of Axolotl:

--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -44,7 +44,7 @@
   "outputs": [],
   "source": [
    "!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
-    "!pip install flash-attn==\"2.7.0.post2\"\n",
+    "!pip install flash-attn==\"2.5.0\"\n",
    "!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""
   ]
  },
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -9,7 +9,7 @@ strict: false
 plugins:
  - axolotl.integrations.liger.LigerPlugin
 liger_rms_norm: true
-liger_glu_activation: true
+liger_swiglu: true
 liger_fused_linear_cross_entropy: true

 chat_template: deepseek_v2
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -4,7 +4,7 @@ plugins:
  - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
-liger_glu_activation: true
+liger_swiglu: true
 liger_fused_linear_cross_entropy: true

 strict: false
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -1,93 +0,0 @@
-#Note that we are switching from the regular chat template to chatml.
-#If you experience problems with the special tokens, training for more epochs can help.
-#After training, merge the model before inference otherwise you might
-#face problems with the special tokens.
-
-base_model: mistralai/Mistral-7B-Instruct-v0.2
-model_type: MistralForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-chat_template: chatml
-rl: dpo
-datasets:
-  - path: olivermolenschot/alpaca_messages_dpo_test
-    type: chat_template.default
-    field_messages: conversation
-    field_chosen: chosen
-    field_rejected: rejected
-    message_field_role: role
-    message_field_content: content
-
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./outputs/dpo-qlora
-
-sequence_len: 2048
-sample_packing: false
-pad_to_sequence_len: true
-
-adapter: qlora
-lora_model_dir:
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.2
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-lora_modules_to_save:
- - embed_tokens
- - lm_head
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 16
-num_epochs: 6
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0001
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: false
-s2_attention:
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<|im_start|>"
-  eos_token: "<|im_end|>"
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -1,67 +0,0 @@
-base_model: Qwen/Qwen2.5-0.5B
-
-strict: false
-
-chat_template: qwen_25
-rl: dpo
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_dpo_test
-    type: chat_template.default
-    field_messages: conversation
-    field_chosen: chosen
-    field_rejected: rejected
-    message_field_role: role
-    message_field_content: content
-    roles:
-      system:
-        - system
-      user:
-        - user
-      assistant:
-        - assistant
-
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./outputs/dpo-out
-
-sequence_len: 2048
-sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
--- a/image/axolotl-badge-web-legacy.png
+++ b/image/axolotl-badge-web-legacy.png
--- a/image/axolotl-badge-web.png
+++ b/image/axolotl-badge-web.png
--- a/image/axolotl_logo_digital_black.svg
+++ b/image/axolotl_logo_digital_black.svg
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
-    <path fill="#141310" d="M435,234.3l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185.1h31.6l47.9,185.1h-24.5ZM417.7,164.9l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
-    <path fill="#141310" d="M568.2,234.3l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
-    <path fill="#141310" d="M658.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM658.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-    <path fill="#141310" d="M860.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM860.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-    <path fill="#141310" d="M773.9,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
-    <path fill="#141310" d="M1036.2,234.3V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.8v-24.1h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
-    <path fill="#141310" d="M978.6,234.3c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3v-45.3h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
-    <path fill="#141310" d="M51.5,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v32.8h20.6v-32.8c0-4.7,3.8-8.4,8.4-8.4Z"/>
-    <path fill="#141310" d="M92.8,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v12.2h20.6v-12.2c0-4.7,3.8-8.4,8.4-8.4Z"/>
-    <path fill="#141310" d="M249.3,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v32.8h20.6v-32.8Z"/>
-    <path fill="#141310" d="M187.4,90.2v-20.6h-103.1v20.6h-41.2v20.6h-20.6v41.2c0,11.4,9.2,20.6,20.6,20.6h185.5c11.4,0,20.6-9.2,20.6-20.6v-41.2h-20.6v-20.6h-41.2ZM166.8,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3ZM228.7,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3Z"/>
-    <path fill="#141310" d="M208,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v12.2h20.6v-12.2Z"/>
-    <rect fill="#141310" x="22.5" y="234.5" width="41.2" height="20.6"/>
-    <rect fill="#141310" x="84.3" y="234.5" width="164.9" height="20.6"/>
-    <rect fill="#141310" x="208" y="193.3" width="41.2" height="20.6"/>
-    <rect fill="#141310" x="22.5" y="193.3" width="164.9" height="20.6"/>
-</svg>
--- a/image/axolotl_logo_digital_white.svg
+++ b/image/axolotl_logo_digital_white.svg
@@ -1,11 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
-    <path fill="#fff" d="M462.9,234.2l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185h31.6l47.9,185h-24.4ZM445.7,164.8l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
-    <path fill="#fff" d="M596.1,234.2l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.5-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.3,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.1,49.3,71.6h-28.5Z"/>
-    <path fill="#fff" d="M686.4,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM686.4,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-    <path fill="#fff" d="M888.3,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM888.3,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-    <path fill="#fff" d="M801.7,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.5c0,4.7,3.8,8.5,8.5,8.5h16.7v24.1h-16.7Z"/>
-    <path fill="#fff" d="M1063.8,234.2V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.7v-24.1h16.7c18,0,32.6,14.6,32.6,32.6v152.8h-24.1Z"/>
-    <path fill="#fff" d="M1006.2,234.2c-18,0-32.6-14.6-32.6-32.6v-85h-20.3v-22.1h20.3v-45.2h24.1v45.2h30.2v22.1h-30.2v85c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
-    <path fill="#fff" d="M160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM277.3,57.4c0-23.8-19.3-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.7,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.7-6.3-14.1-14.1-14.1h-12.2c-6.5,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.3-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.2c0,11,5.2,20.8,13.2,27.2-7.3.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.7,6.3,14.1,14.1,14.1h41.2c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h164.9c7.7,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.8-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.2c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM77.8,255.1h-41.2v-20.6h41.2v20.6ZM36.5,213.9v-20.6h164.9v20.6H36.5ZM263.3,255.1H98.4v-20.6h164.9v20.6ZM263.3,213.9h-41.2v-20.6h41.2v20.6ZM263.3,90.2h-20.6v20.6h20.6v41.2c0,11.4-9.2,20.6-20.6,20.6H57.2c-11.4,0-20.6-9.2-20.6-20.6v-41.2h20.6v-20.6h-20.6v-32.8c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.2v-20.6h-20.6v-12.2c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.1v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v12.2h-20.6v20.6h41.2v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v32.8ZM201.4,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
-</svg>
--- a/image/axolotl_symbol_digital_black.svg
+++ b/image/axolotl_symbol_digital_black.svg
@@ -1,26 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
-  <defs>
-    <style>
-      .cls-1 {
-        fill: #141310;
-      }
-    </style>
-  </defs>
-  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
-  <g>
-    <g id="Layer_1">
-      <g>
-        <path class="cls-1" d="M46.9,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v36.9h23.2v-36.9c0-5.2,4.2-9.5,9.5-9.5Z"/>
-        <path class="cls-1" d="M93.2,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v13.7h23.2v-13.7c0-5.2,4.2-9.5,9.5-9.5Z"/>
-        <path class="cls-1" d="M269.3,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v36.9h23.2v-36.9Z"/>
-        <path class="cls-1" d="M199.7,83.8v-23.2h-116v23.2h-46.4v23.2H14.2v46.4c0,12.8,10.4,23.2,23.2,23.2h208.7c12.8,0,23.2-10.4,23.2-23.2v-46.4h-23.2v-23.2h-46.4ZM176.5,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6ZM246.1,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6Z"/>
-        <path class="cls-1" d="M222.9,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v13.7h23.2v-13.7Z"/>
-        <rect class="cls-1" x="14.2" y="246.1" width="46.4" height="23.2"/>
-        <rect class="cls-1" x="83.8" y="246.1" width="185.5" height="23.2"/>
-        <rect class="cls-1" x="222.9" y="199.7" width="46.4" height="23.2"/>
-        <rect class="cls-1" x="14.2" y="199.7" width="185.5" height="23.2"/>
-      </g>
-    </g>
-  </g>
-</svg>
--- a/image/axolotl_symbol_digital_white.svg
+++ b/image/axolotl_symbol_digital_white.svg
@@ -1,16 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
-  <defs>
-    <style>
-      .cls-1 {
-        fill: #fff;
-      }
-    </style>
-  </defs>
-  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
-  <g>
-    <g id="Layer_1">
-      <path class="cls-1" d="M152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM269.3,57.3c0-23.8-19.4-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.8,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.8-6.3-14.1-14.1-14.1h-12.2c-6.6,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.4-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.3c0,11,5.2,20.9,13.2,27.2-7.4.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.8,6.3,14.1,14.1,14.1h41.3c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h165.1c7.8,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.9-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.3c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM69.5,255.2H28.2v-20.6h41.3v20.6ZM28.2,214v-20.6h165.1v20.6H28.2ZM255.2,255.2H90.1v-20.6h165.1v20.6ZM255.2,214h-41.3v-20.6h41.3v20.6ZM255.2,90.1h-20.6v20.6h20.6v41.3c0,11.4-9.2,20.6-20.6,20.6H48.9c-11.4,0-20.6-9.2-20.6-20.6v-41.3h20.6v-20.6h-20.6v-32.8c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.3v-20.6h-20.6v-12.2c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.2v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v12.2h-20.6v20.6h41.3v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v32.8ZM193.3,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
-    </g>
-  </g>
-</svg>
--- a/image/axolotl_wordmark_digital_black.svg
+++ b/image/axolotl_wordmark_digital_black.svg
@@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
-  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
-  <g>
-    <g id="Layer_1">
-      <g>
-        <path d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
-        <path d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
-        <path d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-        <path d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-        <path d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
-        <path d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
-        <path d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
-      </g>
-    </g>
-  </g>
-</svg>
--- a/image/axolotl_wordmark_digital_white.svg
+++ b/image/axolotl_wordmark_digital_white.svg
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
-  <defs>
-    <style>
-      .cls-1 {
-        fill: #fff;
-      }
-    </style>
-  </defs>
-  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
-  <g>
-    <g id="Layer_1">
-      <g>
-        <path class="cls-1" d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
-        <path class="cls-1" d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
-        <path class="cls-1" d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-        <path class="cls-1" d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
-        <path class="cls-1" d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
-        <path class="cls-1" d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
-        <path class="cls-1" d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
-      </g>
-    </g>
-  </g>
-</svg>
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,3 +1,2 @@
 pytest
 pytest-xdist
-pytest-retry
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,18 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
 peft==0.13.2
-transformers==4.46.3
+transformers==4.46.0
 tokenizers>=0.20.1
 bitsandbytes==0.44.1
-accelerate==1.1.0
-datasets==3.1.0
-deepspeed==0.15.4
+accelerate==1.0.1
+datasets==3.0.1
+deepspeed==0.15.3
 pydantic==2.6.3
 addict
 fire
 PyYAML>=6.0
 requests
-flash-attn==2.7.0.post2
+flash-attn==2.6.3
 sentencepiece
 wandb
 einops
@@ -28,12 +28,13 @@ scipy
 scikit-learn==1.4.2
 pynvml
 art
+fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
 gradio==3.50.2
 tensorboard
 python-dotenv==1.0.1
-autoawq==0.2.7.post2
+autoawq>=0.2.5
 triton>=2.3.0
-liger-kernel==0.4.1
+liger-kernel==0.3.0

 mamba-ssm==1.2.0.post1

@@ -42,7 +43,7 @@ s3fs>=2024.5.0
 gcsfs>=2024.5.0
 # adlfs

-trl==0.12.0
+trl @ git+https://github.com/huggingface/trl.git@31d02cfb795284591a084416b9dcb7bef5d08924
 zstandard==0.22.0
 fastcore

@@ -53,4 +54,3 @@ immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2

 torchao==0.5.0
-schedulefree==1.3.0
--- a/scripts/cloud-entrypoint.sh
+++ b/scripts/cloud-entrypoint.sh
@@ -2,7 +2,7 @@

 # Export specific ENV variables to /etc/rp_environment
 echo "Exporting environment variables..."
-printenv | grep -E '^HF_|^BNB_|^CUDA_|^NCCL_|^NV|^RUNPOD_|^PATH=|^_=' | sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' | grep -v 'printenv' >> /etc/rp_environment
+printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
 echo 'source /etc/rp_environment' >> ~/.bashrc

 add_keys_to_authorized() {
--- a/setup.py
+++ b/setup.py
@@ -39,10 +39,7 @@ def parse_requirements():
        else:
            # detect the version of torch already installed
            # and set it so dependencies don't clobber the torch version
-            try:
-                torch_version = version("torch")
-            except PackageNotFoundError:
-                torch_version = "2.5.1"
+            torch_version = version("torch")
            _install_requires.append(f"torch=={torch_version}")

            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -57,10 +54,6 @@ def parse_requirements():

            if (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
-                if patch == 0:
-                    _install_requires.append("xformers==0.0.28.post2")
-                else:
-                    _install_requires.append("xformers==0.0.28.post3")
                _install_requires.pop(_install_requires.index(autoawq_version))
            elif (major, minor) >= (2, 4):
                if patch == 0:
@@ -96,19 +89,22 @@ install_requires, dependency_links = parse_requirements()

 setup(
    name="axolotl",
-    version="0.5.1",
+    version="0.4.1",
    description="LLM Trainer",
    long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
    package_dir={"": "src"},
-    packages=find_packages("src"),
+    packages=find_packages(),
    install_requires=install_requires,
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.7.0.post2",
+            "flash-attn==2.6.3",
+        ],
+        "fused-dense-lib": [
+            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.6.2#subdirectory=csrc/fused_dense_lib",
        ],
        "deepspeed": [
-            "deepspeed==0.15.4",
+            "deepspeed==0.14.4",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -190,15 +190,18 @@ def do_inference(
 ):
    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
+    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
+
+    for token, symbol in default_tokens.items():
+        # If the token isn't already specified in the config, add it
+        if not (cfg.special_tokens and token in cfg.special_tokens):
+            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
-    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
-    elif cfg.chat_template:
-        chat_template_str = get_chat_template(cfg.chat_template)

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

@@ -208,31 +211,13 @@ def do_inference(
        instruction = get_multi_line_input()
        if not instruction:
            return
-
        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()
-
-        if chat_template_str:
-            batch = tokenizer.apply_chat_template(
-                [
-                    {
-                        "role": "user",
-                        "content": prompt,
-                    }
-                ],
-                return_tensors="pt",
-                add_special_tokens=True,
-                add_generation_prompt=True,
-                chat_template=chat_template_str,
-                tokenize=True,
-                return_dict=True,
-            )
-        else:
-            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
+        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        print("=" * 40)
        model.eval()
@@ -272,6 +257,13 @@ def do_inference_gradio(

    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
+    # default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
+    default_tokens: Dict[str, str] = {}
+
+    for token, symbol in default_tokens.items():
+        # If the token isn't already specified in the config, add it
+        if not (cfg.special_tokens and token in cfg.special_tokens):
+            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
    chat_template_str = None
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -23,6 +23,10 @@ from axolotl.cli import (
 )
 from axolotl.common.cli import PreprocessCliArgs
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
+from axolotl.prompt_strategies.sharegpt import (
+    register_chatml_template,
+    register_llama3_template,
+)
 from axolotl.utils.trainer import disable_datasets_caching

 LOG = logging.getLogger("axolotl.cli.preprocess")
@@ -40,6 +44,23 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        return_remaining_strings=True
    )

+    if parsed_cfg.chat_template == "chatml":
+        if parsed_cfg.default_system_message:
+            LOG.info(
+                f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
+            )
+            register_chatml_template(parsed_cfg.default_system_message)
+        else:
+            register_chatml_template()
+    elif parsed_cfg.chat_template == "llama3":
+        if parsed_cfg.default_system_message:
+            LOG.info(
+                f"LLaMA-3 set. Adding default system message: {parsed_cfg.default_system_message}"
+            )
+            register_llama3_template(parsed_cfg.default_system_message)
+        else:
+            register_llama3_template()
+
    if not parsed_cfg.dataset_prepared_path:
        msg = (
            Fore.RED
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -19,6 +19,10 @@ from axolotl.cli import (
 )
 from axolotl.common.cli import TrainerCliArgs
 from axolotl.integrations.base import PluginManager
+from axolotl.prompt_strategies.sharegpt import (
+    register_chatml_template,
+    register_llama3_template,
+)
 from axolotl.train import train

 LOG = logging.getLogger("axolotl.cli.train")
@@ -38,6 +42,21 @@ def do_train(cfg, cli_args) -> None:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
+    if cfg.chat_template == "chatml" and cfg.default_system_message:
+        LOG.info(
+            f"ChatML set. Adding default system message: {cfg.default_system_message}"
+        )
+        register_chatml_template(cfg.default_system_message)
+    else:
+        register_chatml_template()
+
+    if cfg.chat_template == "llama3" and cfg.default_system_message:
+        LOG.info(
+            f"LLaMA-3 set. Adding default system message: {cfg.default_system_message}"
+        )
+        register_llama3_template(cfg.default_system_message)
+    else:
+        register_llama3_template()

    if cfg.rl:  # and cfg.rl != "orpo":
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -48,7 +48,6 @@ from trl import (
 )
 from trl.trainer.utils import RewardDataCollatorWithPadding, pad_to_length

-from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
 from axolotl.utils import is_comet_available, is_mlflow_available
@@ -441,7 +440,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                "ao_adamw_8bit",
                "ao_adamw_4bit",
                "ao_adamw_fp8",
-                "adopt_adamw",
+                "soap",
            ]
        ):
            return super().create_optimizer()
@@ -485,6 +484,25 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                    loraplus_lr_embedding=loraplus_lr_embedding,
                    **optimizer_kwargs,
                )
+            elif self.args.alternate_optimizer == "soap":
+                from axolotl.utils.optimizers.soap import SOAP
+
+                optim_args = {
+                    "lr": optimizer_kwargs.pop("lr"),
+                    "eps": optimizer_kwargs.pop("eps"),
+                }
+
+                if self.cfg.optim_args:
+                    optim_args.update(self.cfg.optim_args)
+
+                optim_args["betas"] = (
+                    self.args.optim_soap_beta1,
+                    self.args.optim_soap_beta2,
+                )
+                self.optimizer = SOAP(  # pylint: disable=attribute-defined-outside-init
+                    optimizer_grouped_parameters,
+                    **optim_args,
+                )
            elif self.args.alternate_optimizer == "optimi_adamw":
                from optimi import AdamW

@@ -511,14 +529,6 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
                    AdamWFp8(optimizer_grouped_parameters, **optimizer_kwargs)
                )
-            elif self.args.alternate_optimizer == "adopt_adamw":
-                from axolotl.utils.optimizers.adopt import ADOPT
-
-                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
-                    ADOPT(
-                        optimizer_grouped_parameters, decoupled=True, **optimizer_kwargs
-                    )
-                )

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
@@ -910,13 +920,13 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        for key, value in metrics.items():
            self._stored_metrics[train_eval][key].append(value)

-    def _save_checkpoint(self, model, trial, **kwargs):
+    def _save_checkpoint(self, model, trial, metrics=None):
        # make sure the checkpoint dir exists, since trainer is flakey
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)
        os.makedirs(output_dir, exist_ok=True)
-        return super()._save_checkpoint(model, trial, **kwargs)
+        return super()._save_checkpoint(model, trial, metrics=metrics)


 class AxolotlMambaTrainer(AxolotlTrainer):
@@ -1038,37 +1048,24 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):

        return super().push_to_hub(*args, **kwargs)

-    @staticmethod
    def tokenize_row(
+        self,
        features,
        processing_class,
        max_prompt_length,
        max_completion_length,
        add_special_tokens,
    ) -> Dict:
-        res = DPOTrainer.tokenize_row(
+        res = super().tokenize_row(
            features,
            processing_class,
            max_prompt_length,
            max_completion_length,
            add_special_tokens,
        )
-        # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
-        if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
+        if processing_class.bos_token_id is None and res["prompt_input_ids"][0] is None:
            for key in res.keys():
                res[key] = res[key][1:]
-
-        if processing_class.bos_token and processing_class.bos_token_id is not None:
-            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
-            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
-                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
-                res["chosen_labels"] = res["chosen_labels"][1:]
-                res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
-            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
-                res["rejected_input_ids"] = res["rejected_input_ids"][1:]
-                res["rejected_labels"] = res["rejected_labels"][1:]
-                res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
-
        return res

    def training_step(
@@ -1175,12 +1172,6 @@ class TrainerBuilderBase(abc.ABC):

    def get_callbacks(self) -> List[TrainerCallback]:
        callbacks = []
-
-        plugin_manager = PluginManager.get_instance()
-        callbacks.extend(
-            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
-        )
-
        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
@@ -1207,17 +1198,11 @@ class TrainerBuilderBase(abc.ABC):

        return callbacks

+    @abstractmethod
    def get_post_trainer_create_callbacks(self, trainer):
        """
        Callbacks added after the trainer is created, usually b/c these need access to the trainer
        """
-        callbacks = []
-
-        plugin_manager = PluginManager.get_instance()
-        callbacks.extend(
-            plugin_manager.add_callbacks_post_trainer(cfg=self.cfg, trainer=trainer)
-        )
-        return callbacks

    def hook_pre_create_training_args(self, training_arguments_kwargs):
        # TODO
@@ -1263,7 +1248,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
+        callbacks = []
        if self.cfg.use_wandb and self.cfg.eval_table_size > 0:
            LogPredictionCallback = log_prediction_callback_factory(
                trainer, self.tokenizer, "wandb"
@@ -1300,18 +1285,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))
-
-        if self.cfg.plugins:
-            plugin_manager = PluginManager.get_instance()
-            callbacks.extend(
-                [
-                    cb
-                    for cb in plugin_manager.add_callbacks_post_trainer(
-                        self.cfg, trainer
-                    )
-                    if cb
-                ]
-            )
        return callbacks

    def _get_trainer_cls(self):
@@ -1429,15 +1402,17 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
            # no eval set, so don't eval
-            training_arguments_kwargs["eval_strategy"] = "no"
+            training_arguments_kwargs["evaluation_strategy"] = "no"
        elif self.cfg.eval_steps:
-            training_arguments_kwargs["eval_strategy"] = "steps"
+            training_arguments_kwargs["evaluation_strategy"] = "steps"
            training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps
-        elif self.cfg.eval_strategy:
-            training_arguments_kwargs["eval_strategy"] = self.cfg.eval_strategy
+        elif self.cfg.evaluation_strategy:
+            training_arguments_kwargs[
+                "evaluation_strategy"
+            ] = self.cfg.evaluation_strategy
        else:
            # we have an eval set, but no steps defined, default to use epoch
-            training_arguments_kwargs["eval_strategy"] = "epoch"
+            training_arguments_kwargs["evaluation_strategy"] = "epoch"

        if self.cfg.save_steps:
            training_arguments_kwargs["save_strategy"] = "steps"
@@ -1662,13 +1637,13 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.reward_model:
            trainer_kwargs["max_length"] = self.cfg.sequence_len

-        # pylint: disable=duplicate-code
        if self.cfg.optimizer in [
+            # pylint: disable=duplicate-code
            "optimi_adamw",
            "ao_adamw_4bit",
            "ao_adamw_8bit",
            "ao_adamw_fp8",
-            "adopt_adamw",
+            "soap",
        ]:
            # Set default so transformers doesn't throw
            training_arguments_kwargs["optim"] = "adamw_hf"
@@ -1843,7 +1818,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
+        callbacks = []
        return callbacks

    def build_training_arguments(self, total_num_steps):
@@ -1871,10 +1846,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors

        if self.eval_dataset:
-            training_args_kwargs["eval_strategy"] = "steps"
+            training_args_kwargs["evaluation_strategy"] = "steps"
            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
        else:
-            training_args_kwargs["eval_strategy"] = "no"
+            training_args_kwargs["evaluation_strategy"] = "no"

        if self.cfg.bf16 or self.cfg.bfloat16:
            training_args_kwargs["bf16"] = True
@@ -1929,18 +1904,17 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            # default to saving each epoch if not defined
            training_args_kwargs["save_strategy"] = "epoch"

-        training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
-
        if self.cfg.rl_beta:
            training_args_kwargs["beta"] = self.cfg.rl_beta
        if self.cfg.orpo_alpha:
            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
            training_args_kwargs["beta"] = self.cfg.orpo_alpha

+        training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
+        training_args_cls = AxolotlDPOConfig
        if self.cfg.rpo_alpha is not None:
            training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

-        training_args_cls = None
        if self.cfg.rl == "simpo":
            training_args_cls = AxolotlCPOConfig
            training_args_kwargs["loss_type"] = "simpo"
@@ -1949,13 +1923,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if self.cfg.cpo_alpha is not None:
                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha

-        elif self.cfg.rl == "orpo":
+        if self.cfg.rl == "orpo":
            training_args_cls = AxolotlORPOConfig
            training_args_kwargs["max_length"] = self.cfg.sequence_len
            if self.cfg.max_prompt_len:
                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len

-        elif self.cfg.rl == "kto":
+        if self.cfg.rl == "kto":
            training_args_cls = AxolotlKTOConfig

            training_args_kwargs["desirable_weight"] = (
@@ -1970,17 +1944,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if self.cfg.max_prompt_len:
                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len

-        else:
-            training_args_cls = AxolotlDPOConfig
-            if self.cfg.rl == "ipo":
-                training_args_kwargs["loss_type"] = "ipo"
-            training_args_kwargs["max_length"] = self.cfg.sequence_len
-            training_args_kwargs["max_completion_length"] = None
-            training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
-            training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
-            if self.cfg.dpo_use_weighting is not None:
-                training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
-
        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
            output_dir=self.cfg.output_dir,
            per_device_train_batch_size=self.cfg.micro_batch_size,
@@ -2001,6 +1964,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        training_args = self.build_training_arguments(total_num_steps)
        dpo_trainer_kwargs = {}
        if self.cfg.rl == "ipo":
+            dpo_trainer_kwargs["loss_type"] = "ipo"
            if self.cfg.dpo_label_smoothing:
                dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
        if self.eval_dataset:
@@ -2014,6 +1978,12 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        if self.cfg.rl in ["dpo", "ipo"]:
            trainer_cls = AxolotlDPOTrainer
            trainer_cls_args = [self.model, self.model_ref]
+
+            # these aren't used for the ORPO trainer
+            dpo_trainer_kwargs["max_length"] = self.cfg.sequence_len
+            dpo_trainer_kwargs["max_target_length"] = None
+            dpo_trainer_kwargs["max_prompt_length"] = self.cfg.sequence_len
+            dpo_trainer_kwargs["generate_during_eval"] = self.cfg.use_wandb
        elif self.cfg.rl == "orpo":
            trainer_cls = AxolotlORPOTrainer
            trainer_cls_args = [self.model]
@@ -2057,11 +2027,11 @@ class HFPPOTrainerBuilder(TrainerBuilderBase):
    """

    def get_callbacks(self):
-        callbacks = super().get_callbacks()
+        callbacks = []
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
+        callbacks = []
        return callbacks

    def build(self, total_num_steps):
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -18,10 +18,9 @@ Plugins can be used to integrate third-party models, modify the training process

 To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
 """
-import collections
 import importlib
 import logging
-from typing import OrderedDict
+from typing import List


 class BasePlugin:
@@ -48,7 +47,7 @@ class BasePlugin:
        Initializes the BasePlugin.
        """

-    def register(self, cfg):  # pylint: disable=unused-argument
+    def register(self, cfg):
        """
        Registers the plugin with the given configuration.

@@ -64,7 +63,7 @@ class BasePlugin:
        Returns a pydantic model for the plugin's input arguments.
        """

-    def pre_model_load(self, cfg):  # pylint: disable=unused-argument
+    def pre_model_load(self, cfg):
        """
        Performs actions before the model is loaded.

@@ -75,7 +74,7 @@ class BasePlugin:
        None
        """

-    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
+    def post_model_load(self, cfg, model):
        """
        Performs actions after the model is loaded.

@@ -87,7 +86,7 @@ class BasePlugin:
        None
        """

-    def pre_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+    def pre_lora_load(self, cfg, model):
        """
        Performs actions before LoRA weights are loaded.

@@ -99,7 +98,7 @@ class BasePlugin:
        None
        """

-    def post_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+    def post_lora_load(self, cfg, model):
        """
        Performs actions after LoRA weights are loaded.

@@ -111,7 +110,7 @@ class BasePlugin:
        None
        """

-    def create_optimizer(self, cfg, trainer):  # pylint: disable=unused-argument
+    def create_optimizer(self, cfg, trainer):
        """
        Creates and returns an optimizer for training.

@@ -123,9 +122,7 @@ class BasePlugin:
        object: The created optimizer.
        """

-    def create_lr_scheduler(
-        self, cfg, trainer, optimizer
-    ):  # pylint: disable=unused-argument
+    def create_lr_scheduler(self, cfg, trainer, optimizer):
        """
        Creates and returns a learning rate scheduler.

@@ -138,9 +135,9 @@ class BasePlugin:
        object: The created learning rate scheduler.
        """

-    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
+    def add_callbacks_pre_trainer(self, cfg, model):
        """
-        setup callbacks before creating the trainer.
+        Adds callbacks to the trainer before training.

        Parameters:
        cfg (dict): The configuration for the plugin.
@@ -149,25 +146,20 @@ class BasePlugin:
        Returns:
        List[callable]: A list of callback functions to be added to the TrainingArgs
        """
-        return []

-    def add_callbacks_post_trainer(
-        self, cfg, trainer
-    ):  # pylint: disable=unused-argument
+    def add_callbacks_post_trainer(self, cfg, trainer):
        """
-        Adds callbacks to the trainer after creating the trainer.
-        This is useful for callbacks that require access to the model or trainer.
+        Adds callbacks to the trainer after training.

        Parameters:
        cfg (dict): The configuration for the plugin.
        trainer (object): The trainer object for training.

        Returns:
-        List[callable]: A list of callback functions to be added
+        List[callable]: A list of callback functions to be added to the TrainingArgs
        """
-        return []

-    def post_train(self, cfg, model):  # pylint: disable=unused-argument
+    def post_train(self, cfg, model):
        """
        Performs actions after training is complete.

@@ -179,7 +171,7 @@ class BasePlugin:
        None
        """

-    def post_train_unload(self, cfg):  # pylint: disable=unused-argument
+    def post_train_unload(self, cfg):
        """
        Performs actions after training is complete and the model is unloaded.

@@ -235,7 +227,7 @@ class PluginManager:
    pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
    """

-    plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
+    plugins: List[BasePlugin] = []

    _instance = None

@@ -245,7 +237,7 @@ class PluginManager:
        """
        if cls._instance is None:
            cls._instance = super(PluginManager, cls).__new__(cls)
-            cls._instance.plugins = collections.OrderedDict()
+            cls._instance.plugins: List[BasePlugin] = []
        return cls._instance

    @staticmethod
@@ -273,7 +265,7 @@ class PluginManager:
        """
        try:
            plugin = load_plugin(plugin_name)
-            self.plugins[plugin_name] = plugin
+            self.plugins.append(plugin)
        except ImportError:
            logging.error(f"Failed to load plugin: {plugin_name}")

@@ -285,7 +277,7 @@ class PluginManager:
        list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
        """
        input_args = []
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            input_args_from_plugin = plugin.get_input_args()
            if input_args_from_plugin is not None:
                input_args.append(input_args_from_plugin)
@@ -301,7 +293,7 @@ class PluginManager:
        Returns:
        None
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            plugin.pre_model_load(cfg)

    def post_model_load(self, cfg, model):
@@ -315,7 +307,7 @@ class PluginManager:
        Returns:
        None
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            plugin.post_model_load(cfg, model)

    def pre_lora_load(self, cfg, model):
@@ -329,7 +321,7 @@ class PluginManager:
        Returns:
        None
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            plugin.pre_lora_load(cfg, model)

    def post_lora_load(self, cfg, model):
@@ -343,7 +335,7 @@ class PluginManager:
        Returns:
        None
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            plugin.post_lora_load(cfg, model)

    def create_optimizer(self, cfg, trainer):
@@ -357,7 +349,7 @@ class PluginManager:
        Returns:
        object: The created optimizer, or None if none was found.
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            optimizer = plugin.create_optimizer(cfg, trainer)
            if optimizer is not None:
                return optimizer
@@ -375,7 +367,7 @@ class PluginManager:
        Returns:
        object: The created learning rate scheduler, or None if none was found.
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
            if scheduler is not None:
                return scheduler
@@ -393,10 +385,8 @@ class PluginManager:
        List[callable]: A list of callback functions to be added to the TrainingArgs.
        """
        callbacks = []
-        for plugin in self.plugins.values():
-            plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
-            if plugin_callbacks:  # if the plugin returned a list of callbacks
-                callbacks.extend(plugin_callbacks)
+        for plugin in self.plugins:
+            callbacks.extend(plugin.add_callbacks_pre_trainer(cfg, model))
        return callbacks

    def add_callbacks_post_trainer(self, cfg, trainer):
@@ -411,10 +401,8 @@ class PluginManager:
        List[callable]: A list of callback functions to be added to the TrainingArgs.
        """
        callbacks = []
-        for plugin in self.plugins.values():
-            plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer)
-            if plugin_callbacks:
-                callbacks.extend(plugin_callbacks)
+        for plugin in self.plugins:
+            callbacks.extend(plugin.add_callbacks_post_trainer(cfg, trainer))
        return callbacks

    def post_train_unload(self, cfg):
@@ -428,5 +416,5 @@ class PluginManager:
        Returns:
        None
        """
-        for plugin in self.plugins.values():
+        for plugin in self.plugins:
            plugin.post_train_unload(cfg)
--- a/src/axolotl/integrations/grokfast/README.md
+++ b/src/axolotl/integrations/grokfast/README.md
@@ -1,13 +0,0 @@
-# Grokfast Optimizer
-
-See https://github.com/ironjr/grokfast
-
-### Usage
-
-```yaml
-plugins:
-  - axolotl.integrations.grokfast.GrokfastPlugin
-
-grokfast_alpha: 2.0
-grokfast_lamb: 0.98
-```
--- a/src/axolotl/integrations/grokfast/init.py
+++ b/src/axolotl/integrations/grokfast/init.py
@@ -1,50 +0,0 @@
-"""
-Grokfast plugin for Axolotl
-"""
-import logging
-
-from transformers.trainer_callback import TrainerCallback
-
-from ..base import BasePlugin
-from .args import GrokfastArgs  # pylint: disable=unused-import. # noqa: F401
-from .optimizer import gradfilter_ema
-
-LOG = logging.getLogger("axolotl.integrations.grokfast")
-
-
-class GrokfastCallbackHandler(TrainerCallback):
-    """
-    Transformer trainer callbacks for Grokfast
-    """
-
-    def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs):
-        super().__init__(*args_, **kwargs)
-        self.grads = None
-        self.alpha = alpha
-        self.lamb = lamb
-
-    def on_train_begin(self, *args_, **kwargs):  # pylint: disable=unused-argument
-        self.grads = None
-
-    def on_pre_optimizer_step(
-        self, args_, state, control, **kwargs
-    ):  # pylint: disable=unused-argument
-        model = kwargs.pop("model")
-        self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
-        return control
-
-
-class GrokfastPlugin(BasePlugin):
-    """
-    Plugin for Grokfast optimizer integraton with Axolotl.
-    """
-
-    def get_input_args(self):
-        return "axolotl.integrations.grokfast.GrokfastArgs"
-
-    def add_callbacks_post_trainer(self, cfg, trainer):
-        LOG.info("Adding Grokfast callback to the trainer")
-        callback = GrokfastCallbackHandler(
-            alpha=cfg.grokfast_alpha, lamb=cfg.grokfast_lamb
-        )
-        return [callback]
--- a/src/axolotl/integrations/grokfast/args.py
+++ b/src/axolotl/integrations/grokfast/args.py
@@ -1,15 +0,0 @@
-"""
-config args for grokfast plugin
-"""
-from typing import Optional
-
-from pydantic import BaseModel
-
-
-class GrokfastArgs(BaseModel):
-    """
-    Input args for Grokfast optimizer.
-    """
-
-    grokfast_alpha: Optional[float] = 0.98
-    grokfast_lamb: Optional[float] = 2.0
--- a/src/axolotl/integrations/grokfast/optimizer.py
+++ b/src/axolotl/integrations/grokfast/optimizer.py
@@ -1,63 +0,0 @@
-# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
-# Reference: https://github.com/ironjr/grokfast
-
-# pylint: skip-file
-from collections import deque
-from typing import Dict, Literal, Optional
-
-import torch
-import torch.nn as nn
-
-
-def gradfilter_ma(
-    m: nn.Module,
-    grads: Optional[Dict[str, deque]] = None,
-    window_size: int = 100,
-    lamb: float = 5.0,
-    filter_type: Literal["mean", "sum"] = "mean",
-    warmup: bool = True,
-    trigger: bool = False,  # For ablation study.
-) -> Dict[str, deque]:
-    if grads is None:
-        grads = {
-            n: deque(maxlen=window_size)
-            for n, p in m.named_parameters()
-            if p.requires_grad and p.grad is not None
-        }
-
-    for n, p in m.named_parameters():
-        if p.requires_grad and p.grad is not None:
-            grads[n].append(p.grad.data.detach())  # .cpu())
-
-            # Modify the gradients.
-            if not warmup or len(grads[n]) == window_size and not trigger:
-                if filter_type == "mean":
-                    avg = sum(grads[n]) / len(grads[n])
-                elif filter_type == "sum":
-                    avg = sum(grads[n])
-                else:
-                    raise ValueError(f"Unrecognized filter_type {filter_type}")
-                p.grad.data = p.grad.data + avg * lamb
-
-    return grads
-
-
-def gradfilter_ema(
-    m: nn.Module,
-    grads: Optional[Dict[str, torch.Tensor]] = None,
-    alpha: float = 0.98,
-    lamb: float = 2.0,
-) -> Dict[str, torch.Tensor]:
-    if grads is None:
-        grads = {
-            n: p.grad.data.detach()
-            for n, p in m.named_parameters()
-            if p.requires_grad and p.grad is not None
-        }
-
-    for n, p in m.named_parameters():
-        if p.requires_grad and p.grad is not None:
-            grads[n] = grads[n] * alpha + p.grad.data.detach() * (1 - alpha)
-            p.grad.data = p.grad.data + grads[n] * lamb
-
-    return grads
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -18,24 +18,20 @@ Module for the Plugin for LIGER integraton with Axolotl.
 Liger Kernel is the collection of Triton-native kernels for LLM Training.
 It is designed to be performant, correct, and light-weight.
 """
-import inspect
 import logging
 import sys
+from functools import partial

 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
-from liger_kernel.transformers.functional import liger_cross_entropy
-from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
+from liger_kernel.transformers.geglu import LigerGEGLUMLP
 from liger_kernel.transformers.rms_norm import LigerRMSNorm
 from liger_kernel.transformers.rope import liger_rotary_pos_emb
 from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

 from axolotl.integrations.base import BasePlugin

-from ...utils.distributed import zero_only
 from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401

-LOG = logging.getLogger("axolotl.integrations.liger")
-

 class LigerPlugin(BasePlugin):
    """
@@ -46,31 +42,59 @@ class LigerPlugin(BasePlugin):
        return "axolotl.integrations.liger.LigerArgs"

    def pre_model_load(self, cfg):
-        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
-            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
-            liger_fn_sig = inspect.signature(apply_liger_fn)
-            kwargs = {}
-            if "rope" in liger_fn_sig.parameters:
-                kwargs["rope"] = cfg.liger_rope
-            if "cross_entropy" in liger_fn_sig.parameters:
-                kwargs["cross_entropy"] = cfg.liger_cross_entropy
-            if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
-                kwargs[
-                    "fused_linear_cross_entropy"
-                ] = cfg.liger_fused_linear_cross_entropy
-            if "rms_norm" in liger_fn_sig.parameters:
-                kwargs["rms_norm"] = cfg.liger_rms_norm
-            if "layer_norm" in liger_fn_sig.parameters:
-                kwargs["layer_norm"] = cfg.liger_layer_norm
-            if "geglu" in liger_fn_sig.parameters:
-                kwargs["geglu"] = cfg.liger_glu_activation
-            elif "swiglu" in liger_fn_sig.parameters:
-                kwargs["swiglu"] = cfg.liger_glu_activation
-            with zero_only():
-                LOG.info(
-                    f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
+        if cfg.model_config_type == "llama":
+            from liger_kernel.transformers.model.llama import (
+                lce_forward as llama_lce_forward,
+            )
+            from transformers.models.llama import modeling_llama
+
+            if cfg.liger_rope:
+                modeling_llama.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_llama.LlamaRMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_llama.LlamaMLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_llama.CrossEntropyLoss = LigerCrossEntropyLoss
+            elif cfg.liger_fused_linear_cross_entropy:
+                modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
+
+        elif cfg.model_config_type == "mistral":
+            from liger_kernel.transformers.model.mistral import (
+                lce_forward as mistral_lce_forward,
+            )
+            from transformers.models.mistral import modeling_mistral
+
+            if cfg.liger_rope:
+                modeling_mistral.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_mistral.MistralRMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_mistral.MistralMLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+
+        elif cfg.model_config_type == "gemma":
+            from liger_kernel.transformers.model.gemma import (
+                lce_forward as gemma_lce_forward,
+            )
+            from transformers.models.gemma import modeling_gemma
+
+            if cfg.liger_rope:
+                modeling_gemma.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_gemma.GemmaRMSNorm = partial(
+                    LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
                )
-            apply_liger_fn(**kwargs)
+            if cfg.liger_swiglu:
+                modeling_gemma.GemmaMLP = LigerGEGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_gemma.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
+
        elif cfg.model_config_type == "jamba":
            from transformers.models.jamba import modeling_jamba

@@ -80,14 +104,30 @@ class LigerPlugin(BasePlugin):
                modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
            if cfg.liger_rms_norm:
                modeling_jamba.JambaRMSNorm = LigerRMSNorm
-            if cfg.liger_glu_activation:
+            if cfg.liger_swiglu:
                modeling_jamba.JambaMLP = LigerSwiGLUMLP
            if cfg.liger_cross_entropy:
-                from transformers.loss.loss_utils import nn
-
-                nn.functional.cross_entropy = liger_cross_entropy
+                modeling_jamba.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
+
+        elif cfg.model_config_type == "qwen2":
+            from liger_kernel.transformers.model.qwen2 import (
+                lce_forward as qwen2_lce_forward,
+            )
+            from transformers.models.qwen2 import modeling_qwen2
+
+            if cfg.liger_rope:
+                modeling_qwen2.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_qwen2.Qwen2RMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
+
        elif cfg.model_config_type == "deepseek_v2":
            from accelerate import init_empty_weights
            from transformers import AutoModelForCausalLM
@@ -106,11 +146,44 @@ class LigerPlugin(BasePlugin):
                logging.warning("Fused liger_rope is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
-            if cfg.liger_glu_activation:
+            if cfg.liger_swiglu:
                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
            if cfg.liger_cross_entropy:
-                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
-                # nn.CrossEntropyLoss in the forward method.
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
+
+        elif cfg.model_config_type == "gemma2":
+            from transformers.models.gemma2 import modeling_gemma2
+
+            if cfg.liger_rope:
+                modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_gemma2.Gemma2RMSNorm = partial(
+                    LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
+                )
+            if cfg.liger_swiglu:
+                modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                logging.warning(
+                    "Fused linear cross entropy is not supported for Gemma 2."
+                )
+
+        elif cfg.model_config_type == "phi3":
+            from liger_kernel.transformers.model.phi3 import (
+                lce_forward as phi3_lce_forward,
+            )
+            from transformers.models.phi3 import modeling_phi3
+
+            if cfg.liger_rope:
+                modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_phi3.Phi3RMSNorm = LigerRMSNorm
+            if cfg.liger_swiglu:
+                modeling_phi3.Phi3MLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
--- a/src/axolotl/integrations/liger/args.py
+++ b/src/axolotl/integrations/liger/args.py
@@ -15,12 +15,9 @@
 """
 Module for handling LIGER input arguments.
 """
-import logging
 from typing import Optional

-from pydantic import BaseModel, model_validator
-
-LOG = logging.getLogger("axolotl.integrations.liger.args")
+from pydantic import BaseModel


 class LigerArgs(BaseModel):
@@ -30,24 +27,6 @@ class LigerArgs(BaseModel):

    liger_rope: Optional[bool] = None
    liger_rms_norm: Optional[bool] = None
-    liger_layer_norm: Optional[bool] = None
    liger_swiglu: Optional[bool] = None
-    liger_glu_activation: Optional[bool] = None
    liger_cross_entropy: Optional[bool] = None
    liger_fused_linear_cross_entropy: Optional[bool] = None
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_deprecated_swiglu(cls, data):
-        if data.get("liger_swiglu") is not None:
-            if data.get("liger_glu_activation") is not None:
-                raise ValueError(
-                    "You cannot have both `liger_swiglu` and `liger_glu_activation` set."
-                )
-
-            LOG.warning(
-                "The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
-                "Please use 'liger_glu_activation' instead."
-            )
-            data["liger_glu_activation"] = data.pop("liger_swiglu")
-        return data
--- a/src/axolotl/monkeypatch/fastchat_conversation_turns.py
+++ b/src/axolotl/monkeypatch/fastchat_conversation_turns.py
@@ -0,0 +1,231 @@
+"""
+monkeypatch to add a get_turns method
+"""
+
+import logging
+from typing import Generator, Tuple
+
+from fastchat.conversation import SeparatorStyle
+
+LOG = logging.getLogger("axolotl.monkeypatch.fastchat_conversation_turns")
+
+
+def get_prompt(self) -> str:
+    ret = ""
+    for role, msg in self.get_turns():
+        ret += role + msg
+    return ret
+
+
+def get_turns(  # pylint: disable=too-many-return-statements
+    self,
+) -> Generator[Tuple[str, str], None, None]:
+    """Get the prompt for generation."""
+    system_prompt = self.system_template.format(system_message=self.system_message)
+    if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
+        yield "", system_prompt + self.sep
+        for role, message in self.messages:
+            if message:
+                yield role + ": ", message + self.sep
+            else:
+                yield role + ":", ""
+        return
+    if self.sep_style == SeparatorStyle.ADD_COLON_TWO:
+        seps = [self.sep, self.sep2]
+        yield "", system_prompt + seps[0]
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                yield role + ": ", message + seps[i % 2]
+            else:
+                yield role + ":", ""
+        return
+    if self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
+        yield "", system_prompt + self.sep
+        for role, message in self.messages:
+            if message:
+                yield role + ": ", message + self.sep
+            else:
+                yield role + ": ", ""  # must be end with a space
+        return
+    if self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
+        yield "", "" if system_prompt == "" else system_prompt + self.sep
+        for role, message in self.messages:
+            if message:
+                yield role + "\n", message + self.sep
+            else:
+                yield role + "\n", ""
+        return
+    if self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
+        yield "", system_prompt
+        for role, message in self.messages:
+            if message:
+                yield role, message + self.sep
+            else:
+                yield role, ""
+        return
+    if self.sep_style == SeparatorStyle.NO_COLON_TWO:
+        seps = [self.sep, self.sep2]
+        yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                yield role, message + seps[i % 2]
+            else:
+                yield role, ""
+        return
+    if self.sep_style == SeparatorStyle.RWKV:
+        yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                yield role + ": ", message.replace("\r\n", "\n").replace(
+                    "\n\n", "\n"
+                ) + "\n\n"
+            else:
+                yield role + ":", ""
+        return
+    if self.sep_style == SeparatorStyle.LLAMA2 and self.name != "mistral":
+        if self.system_message:
+            if self.messages:
+                # For llama, the system message is incorporated into the first human instruction
+                first_role, first_msg = self.messages[0]
+                if first_role == self.roles[0]:
+                    system_prompt += first_msg
+                    self.messages.pop(0)
+            yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                if (i % 2 == 0 and not self.system_message) or (
+                    i % 2 != 0 and self.system_message
+                ):
+                    role = "<s> " + role
+                yield role + " ", message
+            else:
+                yield role, ""
+        return
+    if self.sep_style == SeparatorStyle.LLAMA2 and self.name == "mistral":
+        contains_sys_msg = False
+        if self.system_message:
+            contains_sys_msg = True
+            if self.messages:
+                # There is no clear guidance on how to handle system messages in Mistral so we just prepend it to the first human instruction separated by a newline
+                first_role, first_msg = self.messages[0]
+                if first_role == self.roles[0]:
+                    system_prompt = self.system_template.format(
+                        system_message=" " + self.system_message
+                    )
+                    system_prompt += first_msg
+                    self.messages.pop(0)
+            yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message and i == 0 and not contains_sys_msg:
+                yield "", system_prompt.strip() + " " + message  # if there is no system message, we need to make sure there is the a `<s> [INST]` at the beginning of the first instruction.
+            elif message:
+                yield role + " ", message
+            else:
+                yield role, ""
+        return
+    if self.sep_style == SeparatorStyle.LLAMA3:
+        if self.system_message:
+            # For llama3, the system message is NOT incorporated into the first human instruction
+            # All messages follow <|start_header_id|>' + role + '<|end_header_id|>\n\n'+ message + '<|eot_id|>
+            yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", f"{message.strip()}<|eot_id|>"
+            else:
+                yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", ""
+        return
+    if self.sep_style == SeparatorStyle.GEMMA:
+        if self.system_message:
+            raise ValueError("Gemma chat template does not support system messages")
+        for i, (role, message) in enumerate(self.messages):
+            prefix = "<bos>" if i == 0 else ""
+            message_str = message if message else ""
+            yield prefix + "<start_of_turn>" + role + "\n", message_str + "<end_of_turn>\n"
+        return
+    if self.sep_style == SeparatorStyle.CHATGLM:
+        # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
+        # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
+        round_add_n = 1 if self.name == "chatglm2" else 0
+        if system_prompt:
+            yield "", system_prompt + self.sep
+
+        for i, (role, message) in enumerate(self.messages):
+            if i % 2 == 0:
+                yield "", f"[Round {i//2 + round_add_n}]{self.sep}"
+
+            if message:
+                yield f"{role}：", f"{message}{self.sep}"
+            else:
+                yield f"{role}：", ""
+        return
+    if self.sep_style == SeparatorStyle.CHATML:
+        yield "", "" if system_prompt == "" else system_prompt + self.sep + "\n"
+        for role, message in self.messages:
+            if message:
+                yield role + "\n", message + self.sep + "\n"
+            else:
+                yield role + "\n", ""
+        return
+    if self.sep_style == SeparatorStyle.CHATGLM3:
+        if self.system_message:
+            yield "", system_prompt
+        for role, message in self.messages:
+            if message:
+                yield role + "\n", " " + message
+            else:
+                yield role
+        return
+    if self.sep_style == SeparatorStyle.CHATINTERN:
+        # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
+        seps = [self.sep, self.sep2]
+        yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            prefix = "<s>" if i % 2 == 0 else ""
+            if message:
+                yield prefix + role + ":", message + seps[i % 2] + "\n"
+            else:
+                yield role + ":", ""
+        return
+    if self.sep_style == SeparatorStyle.DOLLY:
+        seps = [self.sep, self.sep2]
+        yield "", system_prompt
+        for i, (role, message) in enumerate(self.messages):
+            if message:
+                suffix = "\n\n" if i % 2 == 1 else ""
+                yield role + ":\n", message + seps[i % 2] + suffix
+            else:
+                yield role + ":\n", ""
+        return
+    if self.sep_style == SeparatorStyle.PHOENIX:
+        yield "", system_prompt
+        for role, message in self.messages:
+            if message:
+                yield role + ": ", "<s>" + message + "</s>"
+            else:
+                yield role + ": " + "<s>", ""
+        return
+    if self.sep_style == SeparatorStyle.ROBIN:
+        yield "", system_prompt + self.sep
+        for role, message in self.messages:
+            if message:
+                yield role + ":\n", message + self.sep
+            else:
+                yield role + ":\n", ""
+        return
+    if self.sep_style == SeparatorStyle.FALCON_CHAT:
+        if self.system_message:
+            yield "", system_prompt + self.sep
+        for role, message in self.messages:
+            if message:
+                yield role + ": ", message + self.sep
+            else:
+                yield role + ":", ""
+    else:
+        raise ValueError(f"Invalid style: {self.sep_style}")
+
+
+def add_get_turns_to_conversation():
+    import fastchat.conversation
+
+    fastchat.conversation.Conversation.get_turns = get_turns
+    fastchat.conversation.Conversation.get_prompt = get_prompt
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -1,5 +1,4 @@
 """multipack patching for v2 of sample packing"""
-
 import importlib

 import transformers
@@ -28,28 +27,71 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
 ]


-def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
-    if has_remote_code:
-        patch_remote(model_name)
-    elif hasattr(transformers, "modeling_flash_attention_utils"):
+def patch_for_multipack(model_type, model_name=None, is_remote_code=False):
+    if model_type == "gemmoe":
+        patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
+    elif model_type == "deepseek_v2":
+        patch_remote(model_name, ".configuration_deepseek", ".modeling_deepseek")
+    elif hasattr(transformers, "modeling_flash_attention_utils") and not is_remote_code:
        transformers.modeling_flash_attention_utils._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
+        if model_type == "mixtral" and is_deepspeed_zero3_enabled():
+            patch_mixtral_moe_forward_zero3()
+        return

-    if model_type == "mixtral" and is_deepspeed_zero3_enabled():
-        patch_mixtral_moe_forward_zero3()
+    # retain for legacy
+    if model_type == "mixtral":
+        transformers.models.mixtral.modeling_mixtral._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+        if is_deepspeed_zero3_enabled():
+            patch_mixtral_moe_forward_zero3()
+    elif model_type == "llama":
+        if hasattr(transformers.models.llama.modeling_llama, "_get_unpad_data"):
+            transformers.models.llama.modeling_llama._get_unpad_data = (  # pylint: disable=protected-access
+                get_unpad_data
+            )
+    elif model_type == "mistral":
+        if hasattr(transformers.models.mistral.modeling_mistral, "_get_unpad_data"):
+            transformers.models.llama.modeling_llama._get_unpad_data = (  # pylint: disable=protected-access
+                get_unpad_data
+            )
+    elif model_type == "qwen2":
+        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "qwen2_moe":
+        transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "falcon":
+        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "phi":
+        transformers.models.phi.modeling_phi._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "gemma":
+        transformers.models.gemma.modeling_gemma._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "gemma2":
+        transformers.models.gemma2.modeling_gemma2._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "starcoder2":
+        transformers.models.starcoder2.modeling_starcoder2._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )


-def patch_remote(model_name):
+def patch_remote(model_name, config_name, modeling_name):
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # we need to load the model here in order for modeling_* to be available
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-    parts = model_config.__class__.__module__.split(".")
-    parts[-1] = parts[-1].replace("configuration_", "modeling_", 1)
-    module_name = ".".join(parts)
+    module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
    modeling_arch = importlib.import_module(module_name)
-    if hasattr(modeling_arch, "_get_unpad_data"):
-        modeling_arch._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
+    modeling_arch._get_unpad_data = get_unpad_data  # pylint: disable=protected-access
--- a/src/axolotl/prompt_strategies/instruct.py
+++ b/src/axolotl/prompt_strategies/instruct.py
@@ -0,0 +1,33 @@
+"""Module containing the InstructShareGPTPromptTokenizingStrategy class"""
+from typing import Any, Dict, Optional
+
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import ShareGPTPrompterV2
+
+
+def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
+    conversation = (
+        ds_cfg["conversation"] if ds_cfg and "conversation" in ds_cfg else None
+    )
+    strategy = InstructShareGPTPromptTokenizingStrategy(
+        # pylint: disable=duplicate-code
+        ShareGPTPrompterV2(
+            conversation=conversation,
+        ),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+    return strategy
+
+
+class InstructShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row
+    """
+
+    def get_conversation_thread(self, prompt):
+        return [
+            {"from": "human", "value": prompt["instruction"]},
+            {"from": "gpt", "value": prompt["output"]},
+        ]
--- a/src/axolotl/prompt_strategies/llama2_chat.py
+++ b/src/axolotl/prompt_strategies/llama2_chat.py
@@ -29,7 +29,7 @@ from dataclasses import dataclass, field
 from typing import Generator, List, Sequence

 from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import ALTERNATING_ASSERTION_FAILED_ROLE, IGNORE_TOKEN_ID
+from axolotl.prompters import IGNORE_TOKEN_ID, SHAREGPT_ASSERTION_FAILED_ROLE


@dataclass
@@ -75,7 +75,7 @@ class Llama2ChatConversation:

 class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):
    """
-    Tokenizing strategy for Llama2 prompts.
+    Tokenizing strategy for ShareGPT prompts.
    adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
    """

@@ -191,7 +191,7 @@ class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
        conv.messages = []  # pylint: disable=R0801
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2], ALTERNATING_ASSERTION_FAILED_ROLE
+            assert role == conv.roles[j % 2], SHAREGPT_ASSERTION_FAILED_ROLE
            if sentence["value"]:
                conv.append_message(role, sentence["value"])
        yield conv
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -0,0 +1,223 @@
+"""Module containing the SimpleShareGPTPromptTokenizingStrategy class"""
+
+import logging
+from typing import Any, Dict, Optional, Type
+
+from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template
+
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import ShareGPTPrompterV2
+from axolotl.utils.tokenization import (
+    chatml_to_conversation,
+    merge_consecutive_messages,
+)
+
+LOG = logging.getLogger("axolotl")
+
+
+def register_chatml_template(system_message=None):
+    system_message = system_message or "You are a helpful assistant."
+    register_conv_template(
+        Conversation(
+            name="chatml",
+            system_template="<|im_start|>system\n{system_message}",
+            system_message=system_message,
+            roles=("<|im_start|>user", "<|im_start|>assistant"),
+            sep_style=SeparatorStyle.CHATML,
+            sep="<|im_end|>",
+        )
+    )
+    register_conv_template(
+        Conversation(
+            name="chatml_glaive",
+            system_template="<|im_start|>system\n{system_message}",
+            system_message=system_message,
+            roles=("<|im_start|>user", "<|im_start|>assistant", "<|im_start|>tool"),
+            sep_style=SeparatorStyle.CHATML,
+            sep="<|im_end|>",
+        )
+    )
+
+
+def register_llama3_template(system_message=None):
+    system_message = system_message or "You are a helpful assistant."
+    register_conv_template(
+        Conversation(
+            name="llama3",
+            system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+            system_message=system_message,
+            roles=("user", "assistant"),
+            sep_style=SeparatorStyle.LLAMA3,
+            sep="",
+            stop_str="<|eot_id|>",
+            stop_token_ids=[128001, 128009],
+        )
+    )
+
+
+def build_loader(
+    tokenization_strategy_cls: Type["ShareGPTPromptTokenizingStrategy"],
+    prompter_cls: Type["ShareGPTPrompterV2"],
+    default_conversation: Optional[str] = None,
+):
+    def _load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
+        LOG.warning(
+            "sharegpt type support will be deprecated in the next release of Axolotl. Please use chat_template instead. https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template",
+        )
+        conversation = (
+            ds_cfg["conversation"]
+            if ds_cfg and "conversation" in ds_cfg
+            else default_conversation
+        )
+        field_human = (
+            ds_cfg["field_human"] if ds_cfg and "field_human" in ds_cfg else None
+        )
+        field_model = (
+            ds_cfg["field_model"] if ds_cfg and "field_model" in ds_cfg else None
+        )
+        roles = ds_cfg["roles"].to_dict() if ds_cfg and "roles" in ds_cfg else None
+        strategy = tokenization_strategy_cls(
+            prompter_cls(
+                conversation=conversation,
+                role_key_model=field_model,
+                role_key_human=field_human,
+                roles=roles,
+            ),
+            tokenizer,
+            cfg.train_on_inputs,
+            cfg.sequence_len,
+        )
+        if ds_cfg and "strict" in ds_cfg and hasattr(strategy, "strict"):
+            strategy.strict = ds_cfg["strict"]
+        if ds_cfg and "field_messages" in ds_cfg and hasattr(strategy, "messages"):
+            strategy.messages = ds_cfg["field_messages"]
+        return strategy
+
+    return _load
+
+
+class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row
+    """
+
+    _strict = False
+    _messages = "conversations"
+
+    @property
+    def strict(self):
+        return self._strict
+
+    @strict.setter
+    def strict(self, strict):
+        self._strict = strict
+
+    @property
+    def messages(self):
+        return self._messages
+
+    @messages.setter
+    def messages(self, messages):
+        self._messages = messages
+
+    def get_conversation_thread(self, prompt):
+        conversations = prompt[self.messages]
+        if self.strict:
+            return conversations
+        role_key = "from"
+        if "role" in conversations[0].keys():
+            role_key = "role"
+        value_key = "value"
+        if "text" in conversations[0].keys():
+            value_key = "text"
+        elif "content" in conversations[0].keys():
+            value_key = "content"
+        # remap roles - allow for assistant turn"
+        role_map = {
+            "user": "human",
+            "human": "human",
+            "assistant": "gpt",
+            "gpt": "gpt",
+            "system": "system",
+        }
+        turns = [
+            {
+                "from": (
+                    role_map[t[role_key]] if t[role_key] in role_map else t[role_key]
+                ),
+                "value": t[value_key],
+                "weight": 1
+                if "weight" not in t or t["weight"] is None
+                else t["weight"],
+            }
+            for t in conversations
+        ]
+        return turns
+
+
+class SimpleRoleShareGPTPromptTokenizingStrategy(
+    SimpleShareGPTPromptTokenizingStrategy
+):
+    """
+    basic sharegpt strategy to grab conversations from the sample row, but uses role instead of from
+    """
+
+    def get_conversation_thread(self, prompt):
+        conversations = prompt["conversations"]
+        # remap role: prompter/assistant, text: ... => from: human/gpt, value: ...
+        turns = [{"from": t["role"], "value": t["value"]} for t in conversations]
+        return turns
+
+
+class GuanacoShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    sharegpt strategy that remaps oasst data to sharegpt format
+    """
+
+    def get_conversation_thread(self, prompt):
+        conversations = prompt["conversations"]
+        # remap role: prompter/assistant, text: ... => from: human/gpt, value: ...
+        role_map = {"prompter": "human", "assistant": "gpt"}
+        turns = [
+            {"from": role_map[t["role"]], "value": t["text"]} for t in conversations
+        ]
+        return turns
+
+
+class UltrachatShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingStrategy):
+    """
+    sharegpt strategy that remaps ultrachat data to sharegpt format
+    """
+
+    def get_conversation_thread(self, prompt):
+        conversations = prompt["messages"]
+        role_map = {"user": "human", "assistant": "gpt"}
+        turns = [
+            {"from": role_map[t["role"]], "value": t["content"]} for t in conversations
+        ]
+        return turns
+
+
+class GlaiveShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingStrategy):
+    """
+    sharegpt strategy that remaps glaive data to sharegpt format
+    """
+
+    def get_conversation_thread(self, prompt):
+        conversation = chatml_to_conversation(prompt)
+        conversation = merge_consecutive_messages(conversation)
+
+        return conversation
+
+
+load = build_loader(SimpleShareGPTPromptTokenizingStrategy, ShareGPTPrompterV2)
+load_role = build_loader(SimpleRoleShareGPTPromptTokenizingStrategy, ShareGPTPrompterV2)
+load_ultrachat = build_loader(
+    UltrachatShareGPTPromptTokenizingStrategy, ShareGPTPrompterV2
+)
+load_guanaco = build_loader(GuanacoShareGPTPromptTokenizingStrategy, ShareGPTPrompterV2)
+load_glaive = build_loader(
+    GlaiveShareGPTPromptTokenizingStrategy,
+    ShareGPTPrompterV2,
+    default_conversation="chatml_glaive",
+)
--- a/src/axolotl/prompt_strategies/sharegpt_jokes.py
+++ b/src/axolotl/prompt_strategies/sharegpt_jokes.py
@@ -0,0 +1,28 @@
+"""Module for Jokes prompts using sharegpt style """
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import ShareGPTPrompterV2
+
+
+def load(tokenizer, cfg):
+    return SimpleJokesShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompterV2(),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
+class SimpleJokesShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    Tokenization strategy for asking bot to tell a joke and then explain why its funny
+    """
+
+    # title, text, explanation
+    def get_conversation_thread(self, prompt):
+        title = "" if not prompt["title"] else prompt["title"] + " "
+        return [
+            {"from": "human", "value": "Tell me a joke."},
+            {"from": "gpt", "value": title + prompt["text"]},
+            {"from": "human", "value": "Why is that joke funny?"},
+            {"from": "gpt", "value": prompt["explanation"]},
+        ]
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -1,12 +1,17 @@
 """Module containing PromptTokenizingStrategy and Prompter classes"""

 import abc
+import copy
 import logging
 from typing import Dict, List, Tuple, Union

+from fastchat.conversation import Conversation
 from transformers import BatchEncoding, PreTrainedTokenizer

-from axolotl.prompters import Prompter
+from axolotl.monkeypatch.fastchat_conversation_turns import (
+    add_get_turns_to_conversation,
+)
+from axolotl.prompters import IGNORE_TOKEN_ID, Prompter

 LOG = logging.getLogger("axolotl")

@@ -16,6 +21,8 @@ LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
 LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
 LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec

+add_get_turns_to_conversation()
+

 class InvalidDataException(Exception):
    """
@@ -324,6 +331,154 @@ class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy):
        )


+class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for ShareGPT prompts.
+    """
+
+    def get_conversation_thread(self, prompt):
+        return prompt["conversations"]
+
+    def tokenize_prompt(self, prompt):
+        # Initial values. We will append to these as we go through the conversation.
+        result, current_len = tokenize_prompt_default()
+        conversation: Conversation = (
+            self.prompter._conversation.copy()  # pylint: disable=protected-access
+        )
+
+        input_roles = {conversation.roles[0]}
+        output_roles = {conversation.roles[1]}
+
+        if len(conversation.roles) == 3:
+            tool_role_label = conversation.roles[2]
+            input_roles.add(tool_role_label)
+
+        # Add roles from the config
+        if self.prompter.roles:
+            if "input" in self.prompter.roles and self.prompter.roles["input"]:
+                for role in self.prompter.roles["input"]:
+                    input_roles.add(role)
+
+            if "output" in self.prompter.roles and self.prompter.roles["output"]:
+                for role in self.prompter.roles["output"]:
+                    output_roles.add(role)
+
+        # support for custom roles from the dataset, only useful for vicuna style prompts/roles
+        role_remap = []
+        if (
+            conversation.name == "vicuna_v1.1"
+            and "roles" in prompt
+            and len(prompt["roles"]) >= 2
+        ):
+            role_remap = [
+                {"from": conversation.roles[0], "to": prompt["roles"][0]},
+                {"from": conversation.roles[1], "to": prompt["roles"][1]},
+            ]
+
+        try:
+            for _, part in enumerate(
+                self.prompter.build_prompt(self.get_conversation_thread(prompt))
+            ):
+                if not isinstance(part, tuple):
+                    LOG.warning(f"expected tuple, got {part}")
+                    continue
+
+                if len(part) <= 2:
+                    role, content = part
+                    weight = 1
+                else:
+                    role, content, weight = part
+
+                # Uses "in" because role contains extra characters
+                input_turn = any(r.lower() in role.lower() for r in input_roles)
+                output_turn = any(r.lower() in role.lower() for r in output_roles)
+                empty_role = role.strip() == ""
+
+                if not any([input_turn, output_turn, empty_role]):
+                    LOG.warning(f"unhandled role: {role}")
+                    continue
+
+                if input_turn:
+                    role = (
+                        role.replace(role_remap[0]["from"], role_remap[0]["to"])
+                        if role_remap
+                        else role
+                    )
+                    turn = role + content
+                    # this is still the user query, we should
+                    if not content.strip():
+                        LOG.warning(f"user turn has empty text: {prompt}")
+                    res = self._tokenize(
+                        turn,
+                        add_eos_token=False,
+                        strip_bos_token=True,
+                    )
+                    if self.train_on_inputs and weight == 1:
+                        labels = copy.deepcopy(res["input_ids"])
+                    else:
+                        # everything from this is masked out from the labels
+                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                elif output_turn:
+                    role = (
+                        role.replace(role_remap[1]["from"], role_remap[1]["to"])
+                        if role_remap
+                        else role
+                    )
+                    turn = role + content
+                    # this should be the assistant response, should end with an eos token
+                    if not content.strip():
+                        LOG.warning(f"assistant turn has empty text: {prompt}")
+                    add_eos_token = not (
+                        conversation.name == "chatml"
+                        and conversation.sep == self.tokenizer.eos_token
+                    )
+                    res = self._tokenize(
+                        turn,
+                        add_eos_token=add_eos_token,
+                        strip_bos_token=True,
+                    )
+                    role_res = self._tokenize(
+                        role.rstrip(),
+                        add_eos_token=False,
+                        strip_bos_token=True,
+                    )
+                    labels = copy.deepcopy(res["input_ids"])
+                    if not self.train_on_inputs:
+                        # mask out role tokens from the labels
+                        len_role = len(role_res["input_ids"])
+                        labels[:len_role] = [IGNORE_TOKEN_ID] * min(
+                            len_role, len(labels)
+                        )
+                    if weight == 0:
+                        # everything from this is masked out from the labels
+                        # (role is masked out too because it makes no sense if contents is masked out)
+                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+
+                elif empty_role:
+                    turn = content
+                    # this is only ever the first part, should include the bos token and the user query
+                    res = self._tokenize(
+                        turn, add_eos_token=False, strip_bos_token=False
+                    )
+                    if self.train_on_inputs and weight == 1:
+                        labels = copy.deepcopy(res["input_ids"])
+                    else:
+                        # everything from this is masked out from the labels
+                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+
+                # pylint: disable=duplicate-code
+                result, current_len = parse_tokenized_to_result(
+                    result,
+                    current_len,
+                    res,
+                    labels,
+                    pad_token_id=self.tokenizer.pad_token_id,
+                )
+            return result
+        except (KeyError, AssertionError, IndexError) as err:
+            raise InvalidDataException(str(err)) from err
+
+
 def tokenize_prompt_default() -> Tuple[Dict[str, List[int]], int]:
    """
    Returns the default values for the tokenize prompt function
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -5,6 +5,7 @@ from enum import Enum
 from typing import Generator, Optional, Union

 from colorama import Fore
+from fastchat.conversation import Conversation, get_conv_template

 LOG = logging.getLogger("axolotl")
 IGNORE_TOKEN_ID = -100
@@ -261,10 +262,166 @@ class ReflectAlpacaPrompter(Prompter):
        )


-ALTERNATING_ASSERTION_FAILED_ROLE = (
+SHAREGPT_ASSERTION_FAILED_ROLE = (
    "Role did not alternate between turns (gpt and human). Please check your data."
 )

+CONVERSATION_ROLE_FORMAT = {
+    "chatml": "<|im_start|>{ROLE}",
+    "zephyr": "<|{ROLE}|>",
+    "vicuna_v1.1": "{ROLE}",
+    "llama3": "<|start_header_id|>{ROLE}<|end_header_id|>",
+}
+
+
+class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
+    """
+    A prompter that generates prompts for the ShareGPT
+    """
+
+    role_key_human = "human"
+    role_key_model = "gpt"
+    # Optional, only used for tool usage datasets.
+    role_key_tool: Optional[str] = None
+    # Optional, role input/output mapping
+    roles: Optional[dict] = None
+
+    def __init__(
+        self,
+        prompt_style=None,  # pylint: disable=unused-argument
+        conversation: Optional[Union[str, Conversation]] = None,
+        role_key_human: Optional[str] = None,
+        role_key_model: Optional[str] = None,
+        role_key_tool: Optional[str] = None,
+        roles: Optional[dict] = None,
+    ):
+        if conversation:
+            if isinstance(conversation, Conversation):
+                self._conversation = conversation
+            else:
+                self._conversation = get_conv_template(conversation)
+        else:
+            self._conversation = get_conv_template("vicuna_v1.1")
+        if role_key_human:
+            self.role_key_human = role_key_human
+        if role_key_model:
+            self.role_key_model = role_key_model
+        if role_key_tool:
+            self.role_key_tool = role_key_tool
+        if roles:
+            self.roles = roles
+
+    def _build_result(self, source):
+        if len(source) < 2:
+            # If there isn't a back and forth conversation, ignore it
+            # also happens on the data splitting leaving empty conversations
+            raise IndexError(
+                f"A conversation entry has less than 2 messages :\n{source}"
+            )
+
+        conv = self._conversation.copy()
+
+        original_source = source.copy()
+        # Add the conversation system prompt if provided, otherwise use the default one
+        if source[0]["from"] == "system":
+            conv.set_system_message(source[0]["value"])
+            source.pop(0)
+
+        roles = {self.role_key_human: conv.roles[0], self.role_key_model: conv.roles[1]}
+        if self.role_key_tool:
+            roles[self.role_key_tool] = conv.roles[2]
+
+        try:
+            # Apply prompt templates
+            if source[0]["from"] not in roles:
+                # Skip the first one if it is not from human
+                source = source[1:]
+        except IndexError as err:
+            # sometimes there is a bing or system chat
+            raise err
+
+        conv.messages = []
+        for _, sentence in enumerate(source):
+            from_role = sentence["from"]
+            if from_role in roles:
+                role = roles[from_role]
+            else:
+                if self._conversation.name not in CONVERSATION_ROLE_FORMAT:
+                    raise NotImplementedError(
+                        f"Role ({role}) not in default roles, and {self._conversation.name} does not support role remapping yet."
+                        "Please help us by creating an Issue to add support for this conversation type."
+                    )
+
+                if self._conversation.name in ["llama3"]:
+                    role = from_role
+                else:
+                    role = CONVERSATION_ROLE_FORMAT[self._conversation.name].format(
+                        ROLE=from_role
+                    )
+
+            if len(conv.messages) > 0 and ((role == conv.messages[-1][0])):
+                if (
+                    role != "assistant"
+                ):  # back to back assistant calls may be okay for tool calls
+                    LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}")
+
+            conv.append_message(role, sentence["value"])
+        turns = list(conv.get_turns())
+        original_source_length = len(original_source)
+        assert len(turns) in [
+            original_source_length - 1,
+            original_source_length,
+            original_source_length + 1,
+        ]
+        if len(turns) == original_source_length + 1:
+            original_source = [{"weight": None}] + original_source
+        elif len(turns) == original_source_length - 1:
+            original_source = original_source[1:]
+        return [
+            (*turn, weight)
+            for turn, weight in zip(
+                turns,
+                [
+                    1 if "weight" not in e or e["weight"] is None else e["weight"]
+                    for e in original_source
+                ],
+            )
+        ]
+
+    def build_prompt(self, source) -> Generator[str, None, None]:
+        turns = self._build_result(source)
+
+        for part in turns:
+            if part[0] and not part[1]:
+                LOG.warning(f"role with empty message: {part[0]}")
+            yield part
+
+    def __repr__(self) -> str:
+        turns = self._build_result([{"from": "{from}", "value": "{value}"}])
+        return "\n".join([REPR_TEMPLATE.format(full_prompt=part) for part in turns])
+
+
+class ShareGPTPrompterV2(ShareGPTPrompter):
+    """
+    A V2 prompter that generates prompts for the ShareGPT
+    """
+
+    def __init__(
+        self,
+        conversation: Optional[Union[str, Conversation]] = None,
+        role_key_human: Optional[str] = None,
+        role_key_model: Optional[str] = None,
+        role_key_tool: Optional[str] = None,
+        roles: Optional[dict] = None,
+    ):
+        super().__init__(
+            conversation=conversation,
+            role_key_human=role_key_human,
+            role_key_model=role_key_model,
+            role_key_tool=role_key_tool,
+            roles=roles,
+        )
+

 class UnsupportedPrompter(Prompter):
    """
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -64,7 +64,10 @@ class EvalFirstStepCallback(
        control: TrainerControl,
        **kwargs,
    ):
-        if args.eval_strategy == IntervalStrategy.STEPS and state.global_step == 1:
+        if (
+            args.evaluation_strategy == IntervalStrategy.STEPS
+            and state.global_step == 1
+        ):
            control.should_evaluate = True
        return control

--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -1,6 +1,8 @@
 """Module for working with config dicts"""
+import json
 import logging
 import os
+from pathlib import Path
 from typing import Optional

 import torch
@@ -8,6 +10,7 @@ from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.integrations.config import merge_input_args
 from axolotl.utils.bench import log_gpu_memory_usage
+from axolotl.utils.config.models.input.v0_4_1 import SUPPORTED_METRICS
 from axolotl.utils.config.models.input.v0_4_1 import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
 )
@@ -212,6 +215,11 @@ def normalize_cfg_datasets(cfg):
    if cfg.chat_template:
        if cfg.datasets:
            for idx, ds_cfg in enumerate(cfg.datasets):
+                if ds_cfg.type == "sharegpt" and not ds_cfg.conversation:
+                    LOG.info(
+                        f"updating dataset {ds_cfg.path} with `conversation: {cfg.chat_template}` to match your chat_template"
+                    )
+                    cfg.datasets[idx].conversation = cfg.chat_template
                if (
                    ds_cfg.type in ["orpo.chat_template", "chat_template"]
                    and not ds_cfg.chat_template
@@ -244,3 +252,391 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
    return DictDefault(
        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
    )
+
+
+def legacy_validate_config(cfg):
+    """
+    This is a "pre-validation" step that handles the yaml configuration before we have any
+    information about the model architecture
+    """
+    if is_torch_bf16_gpu_available():
+        if not cfg.bf16 and not cfg.bfloat16:
+            LOG.info("bf16 support detected, but not enabled for this configuration.")
+    else:
+        if (
+            not cfg.merge_lora
+            and not cfg.is_preprocess
+            and (cfg.bf16 is True or cfg.bfloat16 is True)
+        ):
+            raise ValueError(
+                "bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
+            )
+    if (
+        # pylint: disable=too-many-boolean-expressions
+        not (cfg.bf16 or cfg.bfloat16)
+        and (cfg.fp16 or cfg.float16)
+        and not cfg.adapter
+        and not cfg.flash_attention
+        and cfg.sample_packing
+    ):
+        LOG.warning(
+            "Full fine tune w/o FA2 w/ sample packing and fp16/float16 is likely to raise errors. Try LoRA."
+        )
+        # ValueError: Attempting to unscale FP16 gradients.
+        # OR
+        # RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half
+    if cfg.max_packed_sequence_len:
+        raise DeprecationWarning("`max_packed_sequence_len` is no longer supported")
+
+    if cfg.sample_packing and cfg.rl:
+        raise ValueError("`sample_packing: true` does not work with RLHF training")
+
+    if cfg.sample_packing and not cfg.pad_to_sequence_len:
+        LOG.warning(
+            "`pad_to_sequence_len: true` is recommended when using sample_packing"
+        )
+
+    if cfg.gradient_accumulation_steps and cfg.batch_size:
+        raise ValueError(
+            "please set only one of gradient_accumulation_steps or batch_size"
+        )
+    if cfg.batch_size:
+        LOG.warning(
+            "%s\n%s",
+            "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
+            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
+        )
+    if (
+        cfg.eval_batch_size
+        and cfg.micro_batch_size
+        and cfg.eval_batch_size != cfg.micro_batch_size
+    ):
+        LOG.warning(
+            "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
+        )
+
+    if cfg.adapter == "qlora":
+        if cfg.merge_lora:
+            # can't merge qlora if loaded in 8bit or 4bit
+            if cfg.load_in_8bit:
+                raise ValueError("Can't merge qlora if loaded in 8bit")
+
+            if cfg.gptq:
+                raise ValueError("Can't merge qlora if gptq")
+
+            if cfg.load_in_4bit:
+                raise ValueError("Can't merge qlora if loaded in 4bit")
+
+        else:
+            if cfg.load_in_8bit:
+                raise ValueError("Can't load qlora in 8bit")
+
+            if cfg.gptq:
+                raise ValueError("Can't load qlora if gptq")
+
+            if not cfg.load_in_4bit:
+                raise ValueError("Require cfg.load_in_4bit to be True for qlora")
+
+        if cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp:
+            raise ValueError("Fused modules are not supported with QLoRA")
+
+    loftq = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
+    if not cfg.load_in_8bit and cfg.adapter == "lora" and not loftq:
+        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
+
+    if cfg.adapter == "lora" and (cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp):
+        raise ValueError("Fused modules are not supported with LoRA")
+
+    if cfg.adapter and cfg.peft_layers_to_transform and cfg.unfrozen_parameters:
+        raise ValueError(
+            "`unfrozen_parameters` used with `peft_layers_to_transform` can have unexpected behavior."
+        )
+
+    if cfg.relora_steps:
+        if cfg.adapter not in ("lora", "qlora"):
+            raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA")
+
+        if cfg.fsdp:
+            raise ValueError("fsdp not supported with ReLoRA")
+
+        if cfg.deepspeed:
+            raise ValueError("deepspeed not supported with ReLoRA")
+
+        if cfg.lr_scheduler == "one_cycle":
+            raise ValueError("ReLoRA is not compatible with the one_cycle scheduler")
+
+        if cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp:
+            raise ValueError("Fused modules are not supported with ReLoRA")
+
+    if cfg.trust_remote_code:
+        LOG.warning(
+            "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
+        )
+
+    if cfg.push_dataset_to_hub and cfg.hf_use_auth_token is not True:
+        raise ValueError(
+            "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub"
+        )
+
+    if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
+        raise ValueError("FSDP is not supported for falcon models")
+
+    if (
+        cfg.base_model and "mpt" in cfg.base_model.lower()
+    ) and cfg.gradient_checkpointing:
+        raise ValueError("gradient_checkpointing is not supported for MPT models")
+
+    if cfg.flash_optimum is True:
+        if cfg.adapter:
+            LOG.warning("BetterTransformers probably doesn't work with PEFT adapters")
+        if cfg.fp16 or cfg.bf16:
+            raise ValueError("AMP is not supported with BetterTransformer")
+        if cfg.float16 is not True and cfg.bfloat16 is not True:
+            LOG.warning(
+                "You should probably set bfloat16 or float16 to true to "
+                "load the model in float16 for BetterTransformers"
+            )
+        if int(torch.__version__.split(".", maxsplit=1)[0]) < 2:
+            LOG.warning("torch>=2.0.0 required")
+            raise ValueError(
+                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
+            )
+
+    if cfg.pretraining_dataset and cfg.group_by_length:
+        LOG.warning(
+            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
+        )
+    if cfg.pretraining_dataset and not cfg.max_steps:
+        raise ValueError(
+            "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!"
+        )
+
+    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
+        not cfg.optimizer or "adamw" not in cfg.optimizer
+    ):
+        LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
+
+    if cfg.push_to_hub_model_id:
+        raise ValueError(
+            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
+        )
+
+    if cfg.hub_model_id and cfg.save_strategy not in ["steps", "epoch", None]:
+        LOG.warning(
+            "hub_model_id is set without any models being saved. To save a model, set save_strategy to steps, epochs or leave empty."
+        )
+
+    if cfg.gptq and cfg.revision_of_model:
+        raise ValueError(
+            "revision_of_model is not supported for GPTQ models. "
+            + "Please download the model from HuggingFace Hub manually for correct branch, "
+            + "point to its path, and remove revision_of_model from the config."
+        )
+
+    # if cfg.sample_packing and cfg.sdp_attention:
+    #     # incompatible due to bug w/ accelerate causing 0.0 loss when using llama2
+    #     raise ValueError(
+    #         "sample_packing not compatible with sdp_attention. Use flash_attention"
+    #     )
+
+    if cfg.sample_packing and cfg.xformers_attention:
+        raise ValueError(
+            "sample_packing not compatible with xformers_attention. Use flash_attention"
+        )
+
+    if cfg.sample_packing and cfg.sdp_attention and (cfg.bfloat16 or cfg.bf16):
+        # https://github.com/pytorch/pytorch/blob/1b03423526536b5f3d35bdfa95ccc6197556cf9b/test/test_transformers.py#L2440-L2450
+        LOG.warning(
+            "sample_packing & torch sdpa with bf16 is unsupported may results in 0.0 loss. "
+            "This may work on H100s."
+        )
+
+    if cfg.early_stopping_patience:
+        if not cfg.save_steps or not cfg.eval_steps:
+            raise ValueError(
+                "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps."
+            )
+        if cfg.save_steps % cfg.eval_steps != 0:
+            raise ValueError(
+                "`early_stopping_patience` requires that eval_steps should evenly divide save_steps."
+            )
+
+    if cfg.datasets:
+        for idx, ds_cfg in enumerate(cfg.datasets):
+            if not ds_cfg.type:
+                continue
+            if ds_cfg.type == "sharegpt:chat":
+                LOG.warning(
+                    PendingDeprecationWarning(
+                        "`type: sharegpt:chat` will soon be deprecated. simply use `type: sharegpt` instead."
+                    )
+                )
+                cfg.datasets[idx].type = "sharegpt"
+            if "sharegpt_simple" in ds_cfg.type:
+                LOG.warning(
+                    PendingDeprecationWarning(
+                        "`type: sharegpt_simple` will soon be deprecated. simply use `type: sharegpt` instead."
+                    )
+                )
+                cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
+                    "sharegpt_simple", "sharegpt"
+                )
+
+    if cfg.saves_per_epoch and cfg.save_steps:
+        raise ValueError(
+            "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if cfg.save_strategy and cfg.saves_per_epoch and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy must be empty or set to `steps` when used with saves_per_epoch."
+        )
+    if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
+        raise ValueError(
+            "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
+        )
+    if cfg.evals_per_epoch and cfg.eval_steps:
+        raise ValueError(
+            "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
+        )
+    if (
+        cfg.evals_per_epoch
+        and cfg.evaluation_strategy
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
+        )
+    if (
+        cfg.evaluation_strategy
+        and cfg.eval_steps
+        and cfg.evaluation_strategy != "steps"
+    ):
+        raise ValueError(
+            "evaluation_strategy and eval_steps mismatch. Please set evaluation_strategy to 'steps' or remove eval_steps."
+        )
+
+    if (
+        cfg.val_set_size == 0
+        and (cfg.eval_steps or cfg.evaluation_strategy)
+        and not cfg.test_datasets
+    ):
+        raise ValueError(
+            "eval_steps and evaluation_strategy are not supported with val_set_size == 0"
+        )
+
+    if (
+        cfg.sample_packing
+        and cfg.eval_table_size
+        and cfg.eval_sample_packing is not False
+    ):
+        raise ValueError(
+            "eval_table_size and eval_sample_packing are not supported together with sample_packing. Please set 'eval_sample_packing' to false."
+        )
+
+    if not cfg.adapter and (cfg.load_in_8bit or cfg.load_in_4bit):
+        raise ValueError(
+            "load_in_8bit and load_in_4bit are not supported without setting an adapter."
+            "If you want to full finetune, please turn off load_in_8bit and load_in_4bit."
+        )
+
+    if cfg.rope_scaling:
+        LOG.warning("`rope_scaling` should now be be a key under `model_config`")
+
+    if cfg.wandb_run_id and not cfg.wandb_name:
+        cfg.wandb_name = cfg.wandb_run_id
+
+        LOG.warning(
+            "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
+        )
+
+    if cfg.noisy_embedding_alpha is not None:
+        # Deprecated, use neftune_noise_alpha
+        LOG.warning("noisy_embedding_alpha is deprecated, use neftune_noise_alpha")
+        if cfg.neftune_noise_alpha is None:
+            cfg.neftune_noise_alpha = cfg.noisy_embedding_alpha
+        else:
+            # User is providing both; bail and have them sort out their settings
+            raise ValueError(
+                "noisy_embedding_alpha is deprecated, use neftune_noise_alpha; both are set, please remove the deprecated noisy_embedding_alpha setting"
+            )
+
+    if cfg.neftune_noise_alpha is not None and cfg.neftune_noise_alpha <= 0.0:
+        raise ValueError("neftune_noise_alpha must be > 0.0")
+
+    if cfg.max_memory is not None and cfg.gpu_memory_limit is not None:
+        raise ValueError(
+            "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
+        )
+
+    if (
+        cfg.unfrozen_parameters
+        and cfg.gradient_checkpointing_kwargs
+        and cfg.gradient_checkpointing_kwargs.use_reentrant is True
+    ):
+        # https://github.com/huggingface/transformers/issues/21381
+        raise ValueError(
+            "`use_reentrant` must be false when used with partially frozen model."
+        )
+
+    if cfg.deepspeed and Path(cfg.deepspeed).is_file():
+        with open(cfg.deepspeed, encoding="utf-8") as file:
+            contents = file.read()
+            deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
+            if cfg.flash_attention:
+                if (
+                    deepspeed_cfg.zero_optimization
+                    and deepspeed_cfg.zero_optimization.stage == 3
+                ):
+                    if not (
+                        (
+                            deepspeed_cfg.bf16
+                            and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                        or (
+                            deepspeed_cfg.fp16
+                            and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                    ):
+                        raise ValueError(
+                            "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
+                        )
+            if "8bit" in cfg.optimizer and deepspeed_cfg.optimizer:
+                LOG.warning(
+                    f"conflicting optimizer: {cfg.optimizer} used alongside deepspeed optimizer."
+                )
+
+    if cfg.test_datasets and cfg.val_set_size:
+        raise ValueError(
+            "non-zero val_set_size should not be used with test_datasets configuration"
+        )
+
+    if cfg.fsdp and "bnb" in cfg.optimizer:
+        raise ValueError(f"FSDP not compatible with {cfg.optimizer}")
+
+    if cfg.do_causal_lm_eval and cfg.eval_sample_packing:
+        raise ValueError(
+            "do_causal_lm_eval is enabled, eval_sample_packing must be set to False"
+        )
+
+    if cfg.eval_causal_lm_metrics:
+        if not isinstance(cfg.eval_causal_lm_metrics, list):
+            raise ValueError("eval_causal_lm_metrics must be a list")
+        # only ["sacrebleu", "comet", "ter", "chrf"] supported
+        if set(cfg.eval_causal_lm_metrics) - SUPPORTED_METRICS:
+            raise ValueError(
+                f"eval_causal_lm_metrics must be one of {SUPPORTED_METRICS}"
+            )
+
+    # TODO
+    # MPT 7b
+    # https://github.com/facebookresearch/bitsandbytes/issues/25
+    # no 8bit adaAmw w bf16
+
+    # GPT-NeoX
+    # evals broken when extending context len
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward                        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
+    # attention_mask = causal_mask + attention_mask
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -58,7 +58,6 @@ class ChatTemplate(str, Enum):
    qwen_25 = "qwen_25"  # pylint: disable=invalid-name
    tokenizer_default = "tokenizer_default"  # pylint: disable=invalid-name
    exaone = "exaone"  # pylint: disable=invalid-name
-    metharme = "metharme"  # pylint: disable=invalid-name


 class DeprecatedParameters(BaseModel):
@@ -68,7 +67,6 @@ class DeprecatedParameters(BaseModel):
    rope_scaling: Optional[Any] = None
    noisy_embedding_alpha: Optional[float] = None
    dpo_beta: Optional[float] = None
-    evaluation_strategy: Optional[str] = None

    @field_validator("max_packed_sequence_len")
    @classmethod
@@ -100,13 +98,6 @@ class DeprecatedParameters(BaseModel):
            LOG.warning("dpo_beta is deprecated, use rl_beta instead")
        return dpo_beta

-    @field_validator("evaluation_strategy")
-    @classmethod
-    def validate_evaluation_strategy(cls, evaluation_strategy):
-        if evaluation_strategy is not None:
-            LOG.warning("evaluation_strategy is deprecated, use eval_strategy instead")
-        return evaluation_strategy
-

 class RemappedParameters(BaseModel):
    """parameters that have been remapped to other names"""
@@ -250,10 +241,8 @@ class KTODataset(BaseModel):
 class LoftQConfig(BaseModel):
    """LoftQ configuration subset"""

-    loftq_bits: int = Field(
-        default=4, json_schema_extra={"description": "Quantization bits for LoftQ"}
-    )
-    # loftq_iter: int = Field(default=1, json_schema_extra={"description": "Alternating iterations for LoftQ"})
+    loftq_bits: int = Field(default=4, metadata={"help": "Quantization bits for LoftQ"})
+    # loftq_iter: int = Field(default=1, metadata={"help": "Alternating iterations for LoftQ"})


 class PeftConfig(BaseModel):
@@ -296,8 +285,8 @@ class LoraConfig(BaseModel):

    qlora_sharded_model_loading: Optional[bool] = Field(
        default=False,
-        json_schema_extra={
-            "description": "load qlora model in sharded format for FSDP using answer.ai technique."
+        metadata={
+            "help": "load qlora model in sharded format for FSDP using answer.ai technique."
        },
    )
    lora_on_cpu: Optional[bool] = None
@@ -306,15 +295,13 @@ class LoraConfig(BaseModel):

    loraplus_lr_ratio: Optional[float] = Field(
        default=None,
-        json_schema_extra={
-            "description": "loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4."
+        metadata={
+            "help": "loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4."
        },
    )
    loraplus_lr_embedding: Optional[float] = Field(
        default=1e-6,
-        json_schema_extra={
-            "description": "loraplus learning rate for lora embedding layers."
-        },
+        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )

    merge_lora: Optional[bool] = None
@@ -384,10 +371,10 @@ class ModelInputConfig(BaseModel):
    tokenizer_use_fast: Optional[bool] = None
    tokenizer_legacy: Optional[bool] = None
    tokenizer_type: Optional[str] = Field(
-        default=None, json_schema_extra={"description": "transformers tokenizer class"}
+        default=None, metadata={"help": "transformers tokenizer class"}
    )
    processor_type: Optional[str] = Field(
-        default=None, json_schema_extra={"description": "transformers processor class"}
+        default=None, metadata={"help": "transformers processor class"}
    )
    trust_remote_code: Optional[bool] = None

@@ -409,18 +396,18 @@ class HyperparametersConfig(BaseModel):
    gradient_accumulation_steps: Optional[int] = Field(default=1)
    micro_batch_size: Optional[int] = Field(
        default=1,
-        json_schema_extra={"description": "per gpu micro batch size for training"},
+        metadata={"help": "per gpu micro batch size for training"},
    )
    batch_size: Optional[int] = Field(
        default=None,
-        json_schema_extra={
-            "description": "Total batch size, we do not recommended setting this manually"
+        metadata={
+            "help": "Total batch size, we do not recommended setting this manually"
        },
    )
    eval_batch_size: Optional[int] = Field(
        default=None,
-        json_schema_extra={
-            "description": "per gpu micro batch size for evals, defaults to value of micro_batch_size"
+        metadata={
+            "help": "per gpu micro batch size for evals, defaults to value of micro_batch_size"
        },
    )

@@ -440,20 +427,23 @@ class HyperparametersConfig(BaseModel):
                "ao_adamw_4bit",
                "ao_adamw_8bit",
                "ao_adamw_fp8",
-                "adopt_adamw",
+                "soap",
            ],
        ]
    ] = OptimizerNames.ADAMW_HF.value
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
-        default=None,
-        json_schema_extra={"description": "Optional arguments to supply to optimizer."},
+        default=None, metadata={"help": "Optional arguments to supply to optimizer."}
    )
    optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(
        default=None,
-        json_schema_extra={
-            "description": "The target modules to optimize, i.e. the module names that you would like to train."
+        metadata={
+            "help": "The target modules to optimize, i.e. the module names that you would like to train."
        },
    )
+
+    optim_soap_beta1: Optional[float] = None
+    optim_soap_beta2: Optional[float] = None
+
    torchdistx_path: Optional[str] = None
    lr_scheduler: Optional[Union[SchedulerType, Literal["one_cycle"]]] = "cosine"
    lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
@@ -511,15 +501,15 @@ class LISAConfig(BaseModel):

    lisa_n_layers: Optional[int] = Field(
        default=None,
-        json_schema_extra={"description": "the number of activate layers in LISA"},
+        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = Field(
        default=None,
-        json_schema_extra={"description": "how often to switch layers in LISA"},
+        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = Field(
        default="model.layers",
-        json_schema_extra={"description": "path under the model to access the layers"},
+        metadata={"help": "path under the model to access the layers"},
    )


@@ -603,9 +593,6 @@ class AxolotlInputConfig(

    rl: Optional[RLType] = None
    reward_model: Optional[bool] = None
-    dpo_use_weighting: Optional[
-        bool
-    ] = None  # whether to use weighting in DPO trainer. If none, default is false in the trainer.

    datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset], min_length=1)] = None  # type: ignore
    test_datasets: Optional[conlist(Union[SFTDataset, DPODataset, KTODataset], min_length=1)] = None  # type: ignore
@@ -618,8 +605,7 @@ class AxolotlInputConfig(
    pretraining_dataset: Optional[  # type: ignore
        conlist(Union[PretrainingDataset, SFTDataset], min_length=1)
    ] = Field(
-        default=None,
-        json_schema_extra={"description": "streaming dataset to use for pretraining"},
+        default=None, metadata={"help": {"streaming dataset to use for pretraining"}}
    )
    dataset_processes: Optional[int] = Field(default=os.cpu_count())
    dataset_keep_in_memory: Optional[bool] = None
@@ -679,8 +665,7 @@ class AxolotlInputConfig(
    sequence_len: int = Field(default=512)
    min_sample_len: Optional[int] = None
    max_prompt_len: int = Field(
-        default=512,
-        json_schema_extra={"description": "maximum prompt length for RL training"},
+        default=512, metadata={"help": "maximum prompt length for RL training"}
    )
    sample_packing: Optional[bool] = None
    sample_packing_group_size: Optional[int] = 100_000
@@ -699,8 +684,8 @@ class AxolotlInputConfig(
    pretrain_multipack_buffer_size: Optional[int] = 10_000
    pretrain_multipack_attn: Optional[bool] = Field(
        default=True,
-        json_schema_extra={
-            "description": "whether to prevent cross attention for packed sequences during pretraining",
+        metadata={
+            "help": "whether to prevent cross attention for packed sequences during pretraining",
        },
    )

@@ -746,7 +731,7 @@ class AxolotlInputConfig(
    warmup_ratio: Optional[float] = None
    eval_steps: Optional[Union[int, float]] = None
    evals_per_epoch: Optional[Union[int]] = None
-    eval_strategy: Optional[str] = None
+    evaluation_strategy: Optional[str] = None
    save_steps: Optional[Union[int, float]] = None
    saves_per_epoch: Optional[int] = None
    save_strategy: Optional[str] = None
@@ -798,25 +783,28 @@ class AxolotlInputConfig(
    is_mistral_derived_model: Optional[bool] = Field(default=None)
    is_qwen_derived_model: Optional[bool] = Field(default=None)

-    plugins: Optional[List[str]] = Field(default=None)
-
    @field_validator("datasets", mode="before")
    @classmethod
-    def deprecate_sharegpt_datasets(cls, datasets):
-        for _, ds_cfg in enumerate(datasets):
-            if not ds_cfg.get("type"):
+    def fix_sharegpt_datasets(cls, datasets):
+        for idx, ds_cfg in enumerate(datasets):
+            if not ds_cfg["type"]:
                continue
-
-            ds_type = ds_cfg["type"]
-            # skip if it's a dict (for custom user instruction prompt)
-            if isinstance(ds_type, dict):
-                continue
-
-            if isinstance(ds_type, str) and ds_type.startswith("sharegpt"):
-                raise ValueError(
-                    "`type: sharegpt.*` is deprecated. Please use `type: chat_template` instead."
+            if ds_cfg["type"] == "sharegpt:chat":
+                LOG.warning(
+                    PendingDeprecationWarning(
+                        "`type: sharegpt:chat` will soon be deprecated. simply use `type: sharegpt` instead."
+                    )
+                )
+                datasets[idx]["type"] = "sharegpt"
+            if "sharegpt_simple" in ds_cfg["type"]:
+                LOG.warning(
+                    PendingDeprecationWarning(
+                        "`type: sharegpt_simple` will soon be deprecated. simply use `type: sharegpt` instead."
+                    )
+                )
+                datasets[idx]["type"] = datasets[idx]["type"].replace(
+                    "sharegpt_simple", "sharegpt"
                )
-
        return datasets

    @model_validator(mode="before")
@@ -1048,21 +1036,21 @@ class AxolotlInputConfig(
    @classmethod
    def check_evals(cls, data):
        if (
-            data.get("eval_strategy")
+            data.get("evaluation_strategy")
            and data.get("eval_steps")
-            and data.get("eval_strategy") != "steps"
+            and data.get("evaluation_strategy") != "steps"
        ):
            raise ValueError(
-                "eval_strategy and eval_steps mismatch. Please set eval_strategy to 'steps' or remove eval_steps."
+                "evaluation_strategy and eval_steps mismatch. Please set evaluation_strategy to 'steps' or remove eval_steps."
            )

        if (
            data.get("val_set_size") == 0
-            and (data.get("eval_steps") or data.get("eval_strategy"))
+            and (data.get("eval_steps") or data.get("evaluation_strategy"))
            and not data.get("test_datasets")
        ):
            raise ValueError(
-                "eval_steps and eval_strategy are not supported with val_set_size == 0"
+                "eval_steps and evaluation_strategy are not supported with val_set_size == 0"
            )
        if data.get("evals_per_epoch") and data.get("eval_steps"):
            raise ValueError(
@@ -1070,11 +1058,11 @@ class AxolotlInputConfig(
            )
        if (
            data.get("evals_per_epoch")
-            and data.get("eval_strategy")
-            and data.get("eval_strategy") != "steps"
+            and data.get("evaluation_strategy")
+            and data.get("evaluation_strategy") != "steps"
        ):
            raise ValueError(
-                "eval_strategy must be empty or set to `steps` when used with evals_per_epoch."
+                "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
            )

        if data.get("do_bench_eval") and not (
@@ -1306,25 +1294,6 @@ class AxolotlInputConfig(
            )
        return data

-    @model_validator(mode="before")
-    @classmethod
-    def warn_qlora_zero3_w_use_reentrant(cls, data):
-        if (
-            data.get("adapter") == "qlora"
-            and data.get("gradient_checkpointing_kwargs", {})
-            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
-            is False
-            and "zero3" in data.get("deepspeed", "")
-        ):
-            # may result in:
-            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
-            # Recomputed values for the following tensors have different metadata
-            # than during the forward pass.
-            LOG.warning(
-                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
-            )
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_val_w_test_datasets(cls, data):
@@ -1334,19 +1303,6 @@ class AxolotlInputConfig(
            )
        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_eval_strategy(cls, data):
-        if (
-            data.get("evaluation_strategy") is not None
-            and data.get("eval_strategy") is None
-        ):
-            LOG.info(
-                "explicitly setting `eval_strategy` from the `evaluation_strategy`"
-            )
-            data["eval_strategy"] = data.get("evaluation_strategy")
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_fsdp_offload_w_8bit_optimizer(cls, data):
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -64,57 +64,15 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer):
            tokenizer = load_tokenizer(cfg)
        ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer)

-    if isinstance(data_set, DatasetDict):
-        data_set = data_set["train"]
-
    data_set = data_set.map(
        ds_transform_fn,
        desc="Mapping RL Dataset",
    )
-
+    if isinstance(data_set, DatasetDict):
+        data_set = data_set["train"]
    return data_set


-def drop_long_rl_seq(
-    sample, rl, tokenizer, sequence_len  # pylint: disable=invalid-name
-):
-    if rl in ("dpo", "ipo", "orpo", "simpo"):
-        if not (
-            sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
-        ):
-            raise ValueError(
-                "Prompt, chosen and rejected keys are required for DPO/ORPO datasets"
-            )
-
-        prompt = sample["prompt"]
-        chosen = sample["chosen"]
-        rejected = sample["rejected"]
-
-        len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"])
-        len_chosen = len(tokenizer(chosen, add_special_tokens=False)["input_ids"])
-        len_rejected = len(tokenizer(rejected, add_special_tokens=False)["input_ids"])
-
-        return (len_prompt + len_chosen) <= sequence_len and (
-            len_prompt + len_rejected
-        ) <= sequence_len
-
-    if rl == "kto":
-        if not (sample.get("prompt") and sample.get("completion")):
-            raise ValueError("Prompt and completion keys are required for KTO datasets")
-
-        prompt = sample["prompt"]
-        completion = sample["completion"]
-
-        len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"])
-        len_completion = len(
-            tokenizer(completion, add_special_tokens=False)["input_ids"]
-        )
-
-        return (len_prompt + len_completion) <= sequence_len
-
-    raise ValueError("Unknown RL type")
-
-
 def load_prepare_dpo_datasets(cfg):
    def load_split(dataset_cfgs, _cfg):
        split_datasets: List[Any] = []
@@ -136,7 +94,7 @@ def load_prepare_dpo_datasets(cfg):
                )
                split_datasets.insert(i, ds)

-        tokenizer = load_tokenizer(cfg)
+        tokenizer = None

        for i, data_set in enumerate(split_datasets):
            _type = dataset_cfgs[i]["type"]
@@ -163,28 +121,7 @@ def load_prepare_dpo_datasets(cfg):
                # "prompt", "chosen" and "rejected" already preprocessed
                split_datasets[i] = data_set

-            drop_long = partial(
-                drop_long_rl_seq,
-                rl=_cfg.rl,
-                tokenizer=tokenizer,
-                sequence_len=cfg.sequence_len,
-            )
-
-            prior_len = len(split_datasets[i])
-            split_datasets[i] = split_datasets[i].filter(
-                drop_long,
-                num_proc=cfg.dataset_processes,
-                load_from_cache_file=not cfg.is_preprocess,
-                desc="Dropping Long Sequences",
-            )
-            dropped = prior_len - len(split_datasets[i])
-            if dropped:
-                LOG.warning(f"Dropped {dropped} long samples from dataset index {i}")
-
-        combined_datasets = concatenate_datasets(split_datasets)
-        combined_datasets = combined_datasets.shuffle(seed=cfg.seed)
-
-        return combined_datasets
+        return concatenate_datasets(split_datasets)

    with zero_first(is_main_process()):
        train_is_preprocessed = False
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -2,11 +2,9 @@

 import functools
 import logging
-import time
 from pathlib import Path
 from typing import List, Optional, Tuple, Union

-import requests
 from datasets import (
    Dataset,
    DatasetDict,
@@ -55,28 +53,6 @@ from axolotl.utils.trainer import (
 LOG = logging.getLogger("axolotl")


-def retry_on_request_exceptions(max_retries=3, delay=1):
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):  # pylint: disable=inconsistent-return-statements
-            for attempt in range(max_retries):
-                try:
-                    return func(*args, **kwargs)
-                except (
-                    requests.exceptions.ReadTimeout,
-                    requests.exceptions.ConnectionError,
-                ) as exc:
-                    if attempt < max_retries - 1:
-                        time.sleep(delay)
-                    else:
-                        raise exc
-
-        return wrapper
-
-    return decorator
-
-
-@retry_on_request_exceptions(max_retries=3, delay=5)
 def prepare_dataset(cfg, tokenizer, processor=None):
    prompters = []
    if not cfg.pretraining_dataset:
@@ -260,7 +236,6 @@ def load_tokenized_prepared_datasets(
        for config_dataset in for_d_in_datasets(cfg_datasets):
            ds: Optional[Union[Dataset, DatasetDict]] = None
            ds_from_hub = False
-            ds_trust_remote_code = config_dataset.trust_remote_code
            try:
                # this is just a basic check to see if the path is a
                # valid HF dataset that's loadable
@@ -270,7 +245,6 @@ def load_tokenized_prepared_datasets(
                    streaming=True,
                    token=use_auth_token,
                    revision=config_dataset.revision,
-                    trust_remote_code=ds_trust_remote_code,
                )
                ds_from_hub = True
            except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
@@ -350,15 +324,7 @@ def load_tokenized_prepared_datasets(
                            split=None,
                        )
                    else:
-                        try:
-                            ds = load_from_disk(config_dataset.path)
-                        except FileNotFoundError:
-                            ds = load_dataset(
-                                config_dataset.path,
-                                name=config_dataset.name,
-                                streaming=False,
-                                split=None,
-                            )
+                        ds = load_from_disk(config_dataset.path)
                elif local_path.is_file():
                    ds_type = get_ds_type(config_dataset)

@@ -376,7 +342,7 @@ def load_tokenized_prepared_datasets(
            elif ds_from_hub:
                load_ds_kwargs = {}
                if config_dataset.split:
-                    load_ds_kwargs["split"] = config_dataset.split
+                    load_ds_kwargs = {"split": config_dataset.split}
                ds = load_dataset(
                    config_dataset.path,
                    name=config_dataset.name,
@@ -384,7 +350,6 @@ def load_tokenized_prepared_datasets(
                    data_files=config_dataset.data_files,
                    token=use_auth_token,
                    revision=config_dataset.revision,
-                    trust_remote_code=config_dataset.trust_remote_code,
                    **load_ds_kwargs,
                )
            elif ds_from_cloud and remote_file_system:
@@ -402,7 +367,6 @@ def load_tokenized_prepared_datasets(
                        streaming=False,
                        split=None,
                        storage_options=storage_options,
-                        trust_remote_code=config_dataset.trust_remote_code,
                    )
            elif config_dataset.path.startswith("https://"):
                ds_type = get_ds_type(config_dataset)
@@ -413,7 +377,6 @@ def load_tokenized_prepared_datasets(
                    streaming=False,
                    split=None,
                    storage_options=storage_options,
-                    trust_remote_code=config_dataset.trust_remote_code,
                )
            else:
                if isinstance(config_dataset.data_files, str):
--- a/src/axolotl/utils/environment.py
+++ b/src/axolotl/utils/environment.py
@@ -1,25 +0,0 @@
-"""
-utils to get GPU info for the current environment
-"""
-from accelerate.utils.environment import (
-    check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
-)
-from accelerate.utils.environment import get_gpu_info
-
-
-def check_cuda_p2p_ib_support():
-    if not accelerate_check_cuda_p2p_ib_support():
-        return False
-    unsupported_devices = {"RTX 6000 Ada"}
-    try:
-        device_names, device_count = get_gpu_info()
-        if 1 < device_count < 8:
-            if any(
-                unsupported_device in device_name
-                for device_name in device_names
-                for unsupported_device in unsupported_devices
-            ):
-                return False
-    except Exception:  # pylint: disable=broad-except # nosec
-        pass
-    return True
--- a/src/axolotl/utils/gradient_checkpointing/unsloth.py
+++ b/src/axolotl/utils/gradient_checkpointing/unsloth.py
@@ -14,16 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import torch
-from packaging import version
-
-torch_version = version.parse(torch.__version__)
-
-if torch_version < version.parse("2.4.0"):
-    torch_cuda_amp_custom_fwd = torch.cuda.amp.custom_fwd
-    torch_cuda_amp_custom_bwd = torch.cuda.amp.custom_bwd
-else:
-    torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
-    torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")


 class Unsloth_Offloaded_Gradient_Checkpointer(  # pylint: disable=invalid-name
@@ -35,7 +25,7 @@ class Unsloth_Offloaded_Gradient_Checkpointer(  # pylint: disable=invalid-name
    """

    @staticmethod
-    @torch_cuda_amp_custom_fwd
+    @torch.cuda.amp.custom_fwd
    def forward(ctx, forward_function, hidden_states, *args):
        saved_hidden_states = hidden_states.to("cpu", non_blocking=True)
        with torch.no_grad():
@@ -46,7 +36,7 @@ class Unsloth_Offloaded_Gradient_Checkpointer(  # pylint: disable=invalid-name
        return output

    @staticmethod
-    @torch_cuda_amp_custom_bwd
+    @torch.cuda.amp.custom_bwd
    def backward(ctx, dY):
        (hidden_states,) = ctx.saved_tensors
        hidden_states = hidden_states.to("cuda", non_blocking=True).detach()
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -238,7 +238,6 @@ def load_tokenizer(cfg):
                        x in cfg.lora_modules_to_save for x in lora_modules_to_save
                    )
                )
-                and k != "pad_token"
            ):
                lora_modules_to_save = ", ".join(
                    [f"`{x}`" for x in lora_modules_to_save]
@@ -395,17 +394,10 @@ class ModelLoader:
            and self.cfg.flash_attention
            and self.cfg.sample_packing
        ):
-            has_remote_code = (
-                "auto_map" in self.model_config
-                and "AutoModelForCausalLM" in self.model_config["auto_map"]
-            )
-            if has_remote_code and self.cfg.trust_remote_code is False:
-                # if explicitly set in the YAML, we should prefer that, for example if explicitly disabled
-                has_remote_code = self.cfg.trust_remote_code
            patch_for_multipack(
                self.cfg.model_config_type,
                model_name=self.cfg.base_model,
-                has_remote_code=has_remote_code,
+                is_remote_code=self.cfg.trust_remote_code,
            )

            if self.cfg.is_llama_derived_model:
--- a/src/axolotl/utils/optimizers/init.py
+++ b/src/axolotl/utils/optimizers/init.py
--- a/src/axolotl/utils/optimizers/adopt.py
+++ b/src/axolotl/utils/optimizers/adopt.py
@@ -1,508 +0,0 @@
-"""
-Copied from https://github.com/iShohei220/adopt
-
-ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024)
-Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka
-"""
-# mypy: ignore-errors
-# pylint: skip-file
-# mypy: allow-untyped-decorators
-# mypy: allow-untyped-defs
-from typing import List, Optional, Tuple, Union, cast
-
-import torch
-from torch import Tensor
-from torch.optim.optimizer import (
-    Optimizer,
-    ParamsT,
-    _default_to_fused_or_foreach,
-    _device_dtype_check_for_fused,
-    _disable_dynamo_if_unsupported,
-    _get_capturable_supported_devices,
-    _get_scalar_dtype,
-    _get_value,
-    _use_grad_for_differentiable,
-    _view_as_real,
-)
-
-__all__ = ["ADOPT", "adopt"]
-
-
-class ADOPT(Optimizer):
-    def __init__(
-        self,
-        params: ParamsT,
-        lr: Union[float, Tensor] = 1e-3,
-        betas: Tuple[float, float] = (0.9, 0.9999),
-        eps: float = 1e-6,
-        weight_decay: float = 0.0,
-        decoupled: bool = False,
-        *,
-        foreach: Optional[bool] = None,
-        maximize: bool = False,
-        capturable: bool = False,
-        differentiable: bool = False,
-        fused: Optional[bool] = None,
-    ):
-        if isinstance(lr, Tensor):
-            if foreach and not capturable:
-                raise ValueError(
-                    "lr as a Tensor is not supported for capturable=False and foreach=True"
-                )
-            if lr.numel() != 1:
-                raise ValueError("Tensor lr must be 1-element")
-        if not 0.0 <= lr:
-            raise ValueError(f"Invalid learning rate: {lr}")
-        if not 0.0 <= eps:
-            raise ValueError(f"Invalid epsilon value: {eps}")
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
-        if not 0.0 <= weight_decay:
-            raise ValueError(f"Invalid weight_decay value: {weight_decay}")
-
-        defaults = dict(
-            lr=lr,
-            betas=betas,
-            eps=eps,
-            weight_decay=weight_decay,
-            decoupled=decoupled,
-            maximize=maximize,
-            foreach=foreach,
-            capturable=capturable,
-            differentiable=differentiable,
-            fused=fused,
-        )
-        super().__init__(params, defaults)
-
-        if fused:
-            # TODO: support fused
-            raise RuntimeError("`fused` is not currently supported")
-
-            if differentiable:
-                raise RuntimeError("`fused` does not support `differentiable`")
-            self._step_supports_amp_scaling = True
-            # TODO(crcrpar): [low prec params & their higher prec copy]
-            # Support AMP with FP16/BF16 model params which would need
-            # higher prec copy of params to do update math in higher prec to
-            # alleviate the loss of information.
-            if foreach:
-                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")
-
-    def __setstate__(self, state):
-        super().__setstate__(state)
-        for group in self.param_groups:
-            group.setdefault("maximize", False)
-            group.setdefault("foreach", None)
-            group.setdefault("capturable", False)
-            group.setdefault("differentiable", False)
-            fused = group.setdefault("fused", None)
-            for p in group["params"]:
-                p_state = self.state.get(p, [])
-                if len(p_state) != 0 and not torch.is_tensor(p_state["step"]):
-                    step_val = float(p_state["step"])
-                    p_state["step"] = (
-                        torch.tensor(
-                            step_val,
-                            dtype=_get_scalar_dtype(is_fused=fused),
-                            device=p.device,
-                        )
-                        if group["capturable"] or group["fused"]
-                        else torch.tensor(step_val, dtype=_get_scalar_dtype())
-                    )
-
-    def _init_group(
-        self,
-        group,
-        params_with_grad,
-        grads,
-        exp_avgs,
-        exp_avg_sqs,
-        state_steps,
-    ):
-        has_complex = False
-        for p in group["params"]:
-            if p.grad is not None:
-                has_complex |= torch.is_complex(p)
-                params_with_grad.append(p)
-                if p.grad.is_sparse:
-                    raise RuntimeError("ADOPT does not support sparse gradients")
-                grads.append(p.grad)
-
-                state = self.state[p]
-                # Lazy state initialization
-                if len(state) == 0:
-                    if group["fused"]:
-                        _device_dtype_check_for_fused(p)
-                    # note(crcrpar): [special device hosting for step]
-                    # Deliberately host `step` on CPU if both capturable and fused are off.
-                    # This is because kernel launches are costly on CUDA and XLA.
-                    state["step"] = (
-                        torch.zeros(
-                            (),
-                            dtype=_get_scalar_dtype(is_fused=group["fused"]),
-                            device=p.device,
-                        )
-                        if group["capturable"] or group["fused"]
-                        else torch.tensor(0.0, dtype=_get_scalar_dtype())
-                    )
-                    # Exponential moving average of gradient values
-                    state["exp_avg"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-                    # Exponential moving average of squared gradient values
-                    state["exp_avg_sq"] = torch.zeros_like(
-                        p, memory_format=torch.preserve_format
-                    )
-
-                exp_avgs.append(state["exp_avg"])
-                exp_avg_sqs.append(state["exp_avg_sq"])
-
-                if group["differentiable"] and state["step"].requires_grad:
-                    raise RuntimeError(
-                        "`requires_grad` is not supported for `step` in differentiable mode"
-                    )
-
-                # Foreach without capturable does not support a tensor lr
-                if (
-                    group["foreach"]
-                    and torch.is_tensor(group["lr"])
-                    and not group["capturable"]
-                ):
-                    raise RuntimeError(
-                        "lr as a Tensor is not supported for capturable=False and foreach=True"
-                    )
-
-                state_steps.append(state["step"])
-        return has_complex
-
-    @_use_grad_for_differentiable
-    def step(self, closure=None):
-        """Perform a single optimization step.
-
-        Args:
-            closure (Callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        self._cuda_graph_capture_health_check()
-
-        loss = None
-        if closure is not None:
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            params_with_grad: List[Tensor] = []
-            grads: List[Tensor] = []
-            exp_avgs: List[Tensor] = []
-            exp_avg_sqs: List[Tensor] = []
-            state_steps: List[Tensor] = []
-            beta1, beta2 = group["betas"]
-
-            has_complex = self._init_group(
-                group,
-                params_with_grad,
-                grads,
-                exp_avgs,
-                exp_avg_sqs,
-                state_steps,
-            )
-
-            adopt(
-                params_with_grad,
-                grads,
-                exp_avgs,
-                exp_avg_sqs,
-                state_steps,
-                has_complex=has_complex,
-                beta1=beta1,
-                beta2=beta2,
-                lr=group["lr"],
-                weight_decay=group["weight_decay"],
-                decoupled=group["decoupled"],
-                eps=group["eps"],
-                maximize=group["maximize"],
-                foreach=group["foreach"],
-                capturable=group["capturable"],
-                differentiable=group["differentiable"],
-                fused=group["fused"],
-                grad_scale=getattr(self, "grad_scale", None),
-                found_inf=getattr(self, "found_inf", None),
-            )
-
-        return loss
-
-
-def _single_tensor_adopt(
-    params: List[Tensor],
-    grads: List[Tensor],
-    exp_avgs: List[Tensor],
-    exp_avg_sqs: List[Tensor],
-    state_steps: List[Tensor],
-    grad_scale: Optional[Tensor],
-    found_inf: Optional[Tensor],
-    *,
-    has_complex: bool,
-    beta1: float,
-    beta2: float,
-    lr: Union[float, Tensor],
-    weight_decay: float,
-    decoupled: bool,
-    eps: float,
-    maximize: bool,
-    capturable: bool,
-    differentiable: bool,
-):
-    assert grad_scale is None and found_inf is None
-
-    if torch.jit.is_scripting():
-        # this assert is due to JIT being dumb and not realizing that the ops below
-        # have overloads to handle both float and Tensor lrs, so we just assert it's
-        # a float since most people using JIT are using floats
-        assert isinstance(lr, float)
-
-    for i, param in enumerate(params):
-        grad = grads[i] if not maximize else -grads[i]
-        exp_avg = exp_avgs[i]
-        exp_avg_sq = exp_avg_sqs[i]
-        step_t = state_steps[i]
-
-        # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
-        if not torch._utils.is_compiling() and capturable:
-            capturable_supported_devices = _get_capturable_supported_devices()
-            assert (
-                param.device.type == step_t.device.type
-                and param.device.type in capturable_supported_devices
-            ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
-
-        # update step
-        step_t += 1
-
-        if weight_decay != 0:
-            if decoupled:
-                param.add_(param, alpha=-lr * weight_decay)
-            else:
-                grad = grad.add(param, alpha=weight_decay)
-
-        if torch.is_complex(param):
-            grad = torch.view_as_real(grad)
-            if exp_avg is not None:
-                exp_avg = torch.view_as_real(exp_avg)
-            if exp_avg_sq is not None:
-                exp_avg_sq = torch.view_as_real(exp_avg_sq)
-            param = torch.view_as_real(param)
-
-        step = step_t if capturable or differentiable else _get_value(step_t)
-        if step == 1:
-            exp_avg_sq.addcmul_(grad, grad.conj())
-            continue
-
-        denom = torch.clamp(exp_avg_sq.sqrt(), eps)
-        if step == 2:
-            exp_avg.addcdiv_(grad, denom)
-        else:
-            exp_avg.mul_(beta1).addcdiv_(grad, denom, value=1 - beta1)
-
-        param.add_(exp_avg, alpha=-lr)
-        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)
-
-
-def _multi_tensor_adopt(
-    params: List[Tensor],
-    grads: List[Tensor],
-    exp_avgs: List[Tensor],
-    exp_avg_sqs: List[Tensor],
-    state_steps: List[Tensor],
-    grad_scale: Optional[Tensor],
-    found_inf: Optional[Tensor],
-    *,
-    has_complex: bool,
-    beta1: float,
-    beta2: float,
-    lr: Union[float, Tensor],
-    weight_decay: float,
-    decoupled: bool,
-    eps: float,
-    maximize: bool,
-    capturable: bool,
-    differentiable: bool,
-):
-    if len(params) == 0:
-        return
-
-    if isinstance(lr, Tensor) and not capturable:
-        raise RuntimeError(
-            "lr as a Tensor is not supported for capturable=False and foreach=True"
-        )
-
-    # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
-    if not torch._utils.is_compiling() and capturable:
-        capturable_supported_devices = _get_capturable_supported_devices(
-            supports_xla=False
-        )
-        assert all(
-            p.device.type == step.device.type
-            and p.device.type in capturable_supported_devices
-            for p, step in zip(params, state_steps)
-        ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
-
-    assert grad_scale is None and found_inf is None
-
-    assert not differentiable, "_foreach ops don't support autograd"
-
-    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
-        [params, grads, exp_avgs, exp_avg_sqs, state_steps]  # type: ignore[list-item]
-    )
-    for (
-        device_params_,
-        device_grads_,
-        device_exp_avgs_,
-        device_exp_avg_sqs_,
-        device_state_steps_,
-    ), _ in grouped_tensors.values():
-        device_params = cast(List[Tensor], device_params_)
-        device_grads = cast(List[Tensor], device_grads_)
-        device_exp_avgs = cast(List[Tensor], device_exp_avgs_)
-        device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_)
-        device_state_steps = cast(List[Tensor], device_state_steps_)
-
-        # Handle complex parameters
-        if has_complex:
-            _view_as_real(
-                device_params, device_grads, device_exp_avgs, device_exp_avg_sqs
-            )
-
-        if maximize:
-            device_grads = torch._foreach_neg(device_grads)  # type: ignore[assignment]
-
-        # Update steps
-        # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
-        # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
-        # wrapped it once now. The alpha is required to assure we go to the right overload.
-        if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
-            torch._foreach_add_(
-                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
-            )
-        else:
-            torch._foreach_add_(device_state_steps, 1)
-
-        if weight_decay != 0:
-            if decoupled:
-                torch._foreach_add_(
-                    device_params, device_params, alpha=-lr * weight_decay
-                )
-            else:
-                # Re-use the intermediate memory (device_grads) already allocated for maximize
-                if maximize:
-                    torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
-                else:
-                    device_grads = torch._foreach_add(  # type: ignore[assignment]
-                        device_grads, device_params, alpha=weight_decay
-                    )
-
-        if device_state_steps[0] == 1:
-            torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads)
-            continue
-
-        exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
-        exp_avg_sq_sqrt = torch._foreach_maximum(exp_avg_sq_sqrt, eps)
-
-        if device_state_steps[0] == 2:
-            torch._foreach_addcdiv_(device_exp_avgs, device_grads, exp_avg_sq_sqrt)
-        else:
-            torch._foreach_mul_(device_exp_avgs, beta1)
-            torch._foreach_addcdiv_(
-                device_exp_avgs, device_grads, exp_avg_sq_sqrt, value=1 - beta1
-            )
-
-        torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr)
-        torch._foreach_mul_(device_exp_avg_sqs, beta2)
-        torch._foreach_addcmul_(
-            device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2
-        )
-
-
-@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt)
-def adopt(
-    params: List[Tensor],
-    grads: List[Tensor],
-    exp_avgs: List[Tensor],
-    exp_avg_sqs: List[Tensor],
-    state_steps: List[Tensor],
-    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
-    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
-    foreach: Optional[bool] = None,
-    capturable: bool = False,
-    differentiable: bool = False,
-    fused: Optional[bool] = None,
-    grad_scale: Optional[Tensor] = None,
-    found_inf: Optional[Tensor] = None,
-    has_complex: bool = False,
-    *,
-    beta1: float,
-    beta2: float,
-    lr: Union[float, Tensor],
-    weight_decay: float,
-    decoupled: bool,
-    eps: float,
-    maximize: bool,
-):
-    r"""Functional API that performs ADOPT algorithm computation."""
-    # Respect when the user inputs False/True for foreach or fused. We only want to change
-    # the default when neither have been user-specified. Note that we default to foreach
-    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
-    # bake-in time before making it the default, even if it is typically faster.
-    if fused is None and foreach is None:
-        _, foreach = _default_to_fused_or_foreach(
-            params, differentiable, use_fused=False
-        )
-        # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False.
-        if foreach and isinstance(lr, Tensor) and not capturable:
-            foreach = False
-    if fused is None:
-        fused = False
-    if foreach is None:
-        foreach = False
-
-    # this check is slow during compilation, so we skip it
-    # if it's strictly needed we can add this check back in dynamo
-    if not torch._utils.is_compiling() and not all(
-        isinstance(t, torch.Tensor) for t in state_steps
-    ):
-        raise RuntimeError(
-            "API has changed, `state_steps` argument must contain a list of singleton tensors"
-        )
-
-    if foreach and torch.jit.is_scripting():
-        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
-    if fused and torch.jit.is_scripting():
-        raise RuntimeError("torch.jit.script not supported with fused optimizers")
-
-    # if fused and not torch.jit.is_scripting():
-    #     func = _fused_adopt
-    # elif foreach and not torch.jit.is_scripting():
-    if foreach and not torch.jit.is_scripting():
-        func = _multi_tensor_adopt
-    else:
-        func = _single_tensor_adopt
-
-    func(
-        params,
-        grads,
-        exp_avgs,
-        exp_avg_sqs,
-        state_steps,
-        has_complex=has_complex,
-        beta1=beta1,
-        beta2=beta2,
-        lr=lr,
-        weight_decay=weight_decay,
-        decoupled=decoupled,
-        eps=eps,
-        maximize=maximize,
-        capturable=capturable,
-        differentiable=differentiable,
-        grad_scale=grad_scale,
-        found_inf=found_inf,
-    )
--- a/src/axolotl/utils/optimizers/soap/LICENSE
+++ b/src/axolotl/utils/optimizers/soap/LICENSE
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
+Copyright (c) 2024 Nikhil Vyas

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/src/axolotl/utils/optimizers/soap/init.py
+++ b/src/axolotl/utils/optimizers/soap/init.py
@@ -0,0 +1,475 @@
+# pylint: skip-file
+# Copied from https://github.com/nikhilvyas/SOAP
+from itertools import chain
+
+import torch
+import torch.optim as optim
+
+# Parts of the code are modifications of Pytorch's AdamW optimizer
+# Parts of the code are modifications of code from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/galore_projector.py
+
+
+class SOAP(optim.Optimizer):
+    """
+    Implements SOAP algorithm (https://arxiv.org/abs/2409.11321).
+
+    Parameters:
+        params (`Iterable[nn.parameter.Parameter]`):
+            Iterable of parameters to optimize or dictionaries defining parameter groups.
+        lr (`float`, *optional*, defaults to 0.003):
+            The learning rate to use.
+        betas (`Tuple[float,float]`, *optional*, defaults to `(0.95, 0.95)`):
+            Adam's betas parameters (b1, b2).
+        shampoo_beta (`float`, *optional*, defaults to -1):
+            If >= 0, use this beta for the preconditioner (L and R in paper, state['GG'] below) moving average instead of betas[1].
+        eps (`float`, *optional*, defaults to 1e-08):
+            Adam's epsilon for numerical stability.
+        weight_decay (`float`, *optional*, defaults to 0.01): weight decay coefficient.
+        precondition_frequency (`int`, *optional*, defaults to 10):
+            How often to update the preconditioner.
+        max_precond_dim (`int`, *optional*, defaults to 10000):
+            Maximum dimension of the preconditioner.
+            Set to 10000, so that we exclude most common vocab sizes while including layers.
+        merge_dims (`bool`, *optional*, defaults to `False`):
+            Whether or not to merge dimensions of the preconditioner.
+        precondition_1d (`bool`, *optional*, defaults to `False`):
+            Whether or not to precondition 1D gradients.
+        normalize_grads (`bool`, *optional*, defaults to `False`):
+            Whether or not to normalize gradients per layer.
+            Helps at large precondition_frequency (~100 in our experiments),
+            but hurts performance at small precondition_frequency (~10 in our experiments).
+        data_format (`str`, *optional*, defaults to `channels_first`):
+            Data format of the input for convolutional layers.
+            Should be "channels_last" for data_format of NHWC and "channels_first" for NCHW.
+        correct_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use bias correction in Adam.
+    """
+
+    def __init__(
+        self,
+        params,
+        lr: float = 3e-3,
+        betas=(0.95, 0.95),
+        shampoo_beta: float = -1,
+        eps: float = 1e-8,
+        weight_decay: float = 0.01,
+        precondition_frequency: int = 10,
+        max_precond_dim: int = 10000,  #
+        merge_dims: bool = False,  # Merge dimensions till the product of the dimensions is less than or equal to max_precond_dim.
+        precondition_1d: bool = False,
+        normalize_grads: bool = False,
+        data_format: str = "channels_first",
+        correct_bias: bool = True,
+    ):
+        defaults = {
+            "lr": lr,
+            "betas": betas,
+            "shampoo_beta": shampoo_beta,
+            "eps": eps,
+            "weight_decay": weight_decay,
+            "precondition_frequency": precondition_frequency,
+            "max_precond_dim": max_precond_dim,
+            "merge_dims": merge_dims,
+            "precondition_1d": precondition_1d,
+            "normalize_grads": normalize_grads,
+            "correct_bias": correct_bias,
+        }
+        super().__init__(params, defaults)
+        self._data_format = data_format
+
+    def merge_dims(self, grad, max_precond_dim):
+        """
+        Merges dimensions of the gradient tensor till the product of the dimensions is less than or equal to max_precond_dim.
+        """
+        assert self._data_format in ["channels_first", "channels_last"]
+        if self._data_format == "channels_last" and grad.dim() == 4:
+            grad = grad.permute(0, 3, 1, 2)
+        shape = grad.shape
+        new_shape = []
+
+        curr_shape = 1
+        for sh in shape:
+            temp_shape = curr_shape * sh
+            if temp_shape > max_precond_dim:
+                if curr_shape > 1:
+                    new_shape.append(curr_shape)
+                    curr_shape = sh
+                else:
+                    new_shape.append(sh)
+                    curr_shape = 1
+            else:
+                curr_shape = temp_shape
+
+        if curr_shape > 1 or len(new_shape) == 0:
+            new_shape.append(curr_shape)
+
+        new_grad = grad.reshape(new_shape)
+        return new_grad
+
+    @torch.no_grad()
+    def step(self):
+        """
+        Performs a single optimization step.
+
+        Arguments:
+            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
+        """
+        loss = None
+
+        for group in self.param_groups:
+            for p in group["params"]:
+                if p.grad is None:
+                    continue
+                grad = p.grad
+
+                state = self.state[p]
+
+                if "step" not in state:
+                    state["step"] = 0
+
+                    # State initialization
+                if "exp_avg" not in state:
+                    # Exponential moving average of gradient values
+                    state["exp_avg"] = torch.zeros_like(grad)
+                    # Exponential moving average of squared gradient values
+                    state["exp_avg_sq"] = torch.zeros_like(grad)
+
+                if "Q" not in state:
+                    self.init_preconditioner(
+                        grad,
+                        state,
+                        precondition_frequency=group["precondition_frequency"],
+                        precondition_1d=group["precondition_1d"],
+                        shampoo_beta=(
+                            group["shampoo_beta"]
+                            if group["shampoo_beta"] >= 0
+                            else group["betas"][1]
+                        ),
+                        max_precond_dim=group["max_precond_dim"],
+                        merge_dims=group["merge_dims"],
+                    )
+                    self.update_preconditioner(
+                        grad,
+                        state,
+                        max_precond_dim=group["max_precond_dim"],
+                        merge_dims=group["merge_dims"],
+                        precondition_1d=group["precondition_1d"],
+                    )
+                    continue  # first step is skipped so that we never use the current gradients in the projection.
+
+                # Projecting gradients to the eigenbases of Shampoo's preconditioner
+                # i.e. projecting to the eigenbases of matrices in state['GG']
+                grad_projected = self.project(
+                    grad,
+                    state,
+                    merge_dims=group["merge_dims"],
+                    max_precond_dim=group["max_precond_dim"],
+                )
+
+                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
+                beta1, beta2 = group["betas"]
+
+                state["step"] += 1
+
+                # Decay the first and second moment running average coefficient
+                # In-place operations to update the averages at the same time
+                exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
+                exp_avg_sq.mul_(beta2).add_(
+                    grad_projected.square(), alpha=(1.0 - beta2)
+                )
+
+                denom = exp_avg_sq.sqrt().add_(group["eps"])
+
+                # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
+                # i.e. projecting to the eigenbases of matrices in state['GG']
+                exp_avg_projected = self.project(
+                    exp_avg,
+                    state,
+                    merge_dims=group["merge_dims"],
+                    max_precond_dim=group["max_precond_dim"],
+                )
+
+                step_size = group["lr"]
+                if group["correct_bias"]:
+                    bias_correction1 = 1.0 - beta1 ** (state["step"])
+                    bias_correction2 = 1.0 - beta2 ** (state["step"])
+                    step_size = step_size * (bias_correction2**0.5) / bias_correction1
+
+                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
+                # to the original space
+                norm_grad = self.project_back(
+                    exp_avg_projected / denom,
+                    state,
+                    merge_dims=group["merge_dims"],
+                    max_precond_dim=group["max_precond_dim"],
+                )
+
+                if group["normalize_grads"]:
+                    norm_grad = norm_grad / (1e-30 + torch.mean(norm_grad**2) ** 0.5)
+
+                p.add_(norm_grad, alpha=-step_size)
+
+                # From AdamW code: Just adding the square of the weights to the loss function is *not*
+                # the correct way of using L2 regularization/weight decay with Adam,
+                # since that will interact with the m and v parameters in strange ways.
+                #
+                # Instead we want to decay the weights in a manner that doesn't interact
+                # with the m/v parameters. This is equivalent to adding the square
+                # of the weights to the loss with plain (non-momentum) SGD.
+                # Add weight decay at the end (fixed version)
+                if group["weight_decay"] > 0.0:
+                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))
+
+                # Update is done after the gradient step to avoid using current gradients in the projection.
+                self.update_preconditioner(
+                    grad,
+                    state,
+                    max_precond_dim=group["max_precond_dim"],
+                    merge_dims=group["merge_dims"],
+                    precondition_1d=group["precondition_1d"],
+                )
+
+        return loss
+
+    def init_preconditioner(
+        self,
+        grad,
+        state,
+        precondition_frequency=10,
+        shampoo_beta=0.95,
+        max_precond_dim=10000,
+        precondition_1d=False,
+        merge_dims=False,
+    ):
+        """
+        Initializes the preconditioner matrices (L and R in the paper).
+        """
+        state[
+            "GG"
+        ] = []  # Will hold all the preconditioner matrices (L and R in the paper).
+        if grad.dim() == 1:
+            if not precondition_1d or grad.shape[0] > max_precond_dim:
+                state["GG"].append([])
+            else:
+                state["GG"].append(
+                    torch.zeros(grad.shape[0], grad.shape[0], device=grad.device)
+                )
+        else:
+            if merge_dims:
+                grad = self.merge_dims(grad, max_precond_dim)
+
+            for sh in grad.shape:
+                if sh > max_precond_dim:
+                    state["GG"].append([])
+                else:
+                    state["GG"].append(torch.zeros(sh, sh, device=grad.device))
+
+        state["Q"] = None  # Will hold all the eigenbases of the preconditioner.
+        state["precondition_frequency"] = precondition_frequency
+        state["shampoo_beta"] = shampoo_beta
+
+    def project(self, grad, state, merge_dims=False, max_precond_dim=10000):
+        """
+        Projects the gradient to the eigenbases of the preconditioner.
+        """
+        original_shape = grad.shape
+        if merge_dims:
+            if grad.dim() == 4 and self._data_format == "channels_last":
+                permuted_shape = grad.permute(0, 3, 1, 2).shape
+            grad = self.merge_dims(grad, max_precond_dim)
+
+        for mat in state["Q"]:
+            if len(mat) > 0:
+                grad = torch.tensordot(
+                    grad,
+                    mat,
+                    dims=[[0], [0]],
+                )
+            else:
+                permute_order = list(range(1, len(grad.shape))) + [0]
+                grad = grad.permute(permute_order)
+
+        if merge_dims:
+            if self._data_format == "channels_last" and len(original_shape) == 4:
+                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
+            else:
+                grad = grad.reshape(original_shape)
+        return grad
+
+    def update_preconditioner(
+        self,
+        grad,
+        state,
+        max_precond_dim=10000,
+        merge_dims=False,
+        precondition_1d=False,
+    ):
+        """
+        Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).
+        """
+        if grad.dim() == 1:
+            if precondition_1d and grad.shape[0] <= max_precond_dim:
+                state["GG"][0].lerp_(
+                    grad.unsqueeze(1) @ grad.unsqueeze(0), 1 - state["shampoo_beta"]
+                )
+        else:
+            if merge_dims:
+                new_grad = self.merge_dims(grad, max_precond_dim)
+                for idx, sh in enumerate(new_grad.shape):
+                    if sh <= max_precond_dim:
+                        outer_product = torch.tensordot(
+                            new_grad,
+                            new_grad,
+                            dims=[
+                                [
+                                    *chain(
+                                        range(idx), range(idx + 1, len(new_grad.shape))
+                                    )
+                                ]
+                            ]
+                            * 2,
+                        )
+                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])
+            else:
+                for idx, sh in enumerate(grad.shape):
+                    if sh <= max_precond_dim:
+                        outer_product = torch.tensordot(
+                            grad,
+                            grad,
+                            # Contracts across all dimensions except for k.
+                            dims=[[*chain(range(idx), range(idx + 1, len(grad.shape)))]]
+                            * 2,
+                        )
+                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])
+
+        if state["Q"] is None:
+            state["Q"] = self.get_orthogonal_matrix(state["GG"])
+        if state["step"] > 0 and state["step"] % state["precondition_frequency"] == 0:
+            state["Q"] = self.get_orthogonal_matrix_QR(
+                state, max_precond_dim, merge_dims
+            )
+
+    def project_back(self, grad, state, merge_dims=False, max_precond_dim=10000):
+        """
+        Projects the gradient back to the original space.
+        """
+        original_shape = grad.shape
+        if merge_dims:
+            if self._data_format == "channels_last" and grad.dim() == 4:
+                permuted_shape = grad.permute(0, 3, 1, 2).shape
+            grad = self.merge_dims(grad, max_precond_dim)
+        for mat in state["Q"]:
+            if len(mat) > 0:
+                grad = torch.tensordot(
+                    grad,
+                    mat,
+                    dims=[[0], [1]],
+                )
+            else:
+                permute_order = list(range(1, len(grad.shape))) + [0]
+                grad = grad.permute(permute_order)
+
+        if merge_dims:
+            if self._data_format == "channels_last" and len(original_shape) == 4:
+                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
+            else:
+                grad = grad.reshape(original_shape)
+        return grad
+
+    def get_orthogonal_matrix(self, mat):
+        """
+        Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
+        """
+        matrix = []
+        for m in mat:
+            if len(m) == 0:
+                matrix.append([])
+                continue
+            if m.data.dtype != torch.float:
+                float_data = False
+                original_type = m.data.dtype
+                original_device = m.data.device
+                matrix.append(m.data.float())
+            else:
+                float_data = True
+                matrix.append(m.data)
+
+        final = []
+        for m in matrix:
+            if len(m) == 0:
+                final.append([])
+                continue
+            try:
+                _, Q = torch.linalg.eigh(
+                    m + 1e-30 * torch.eye(m.shape[0], device=m.device)
+                )
+            except:  # pylint: disable=bare-except # noqa: E722
+                _, Q = torch.linalg.eigh(
+                    m.to(torch.float64) + 1e-30 * torch.eye(m.shape[0], device=m.device)
+                )
+                Q = Q.to(m.dtype)
+            Q = torch.flip(Q, [1])
+
+            if not float_data:
+                Q = Q.to(original_device).type(original_type)
+            final.append(Q)
+        return final
+
+    def get_orthogonal_matrix_QR(self, state, max_precond_dim=10000, merge_dims=False):
+        """
+        Computes the eigenbases of the preconditioner using one round of power iteration
+        followed by torch.linalg.qr decomposition.
+        """
+        precond_list = state["GG"]
+        orth_list = state["Q"]
+
+        matrix = []
+        orth_matrix = []
+        for m, o in zip(precond_list, orth_list):
+            if len(m) == 0:
+                matrix.append([])
+                orth_matrix.append([])
+                continue
+            if m.data.dtype != torch.float:
+                float_data = False
+                original_type = m.data.dtype
+                original_device = m.data.device
+                matrix.append(m.data.float())
+                orth_matrix.append(o.data.float())
+            else:
+                float_data = True
+                matrix.append(m.data.float())
+                orth_matrix.append(o.data.float())
+
+        orig_shape = state["exp_avg_sq"].shape
+        if self._data_format == "channels_last" and len(orig_shape) == 4:
+            permuted_shape = state["exp_avg_sq"].permute(0, 3, 1, 2).shape
+        if merge_dims:
+            exp_avg_sq = self.merge_dims(state["exp_avg_sq"], max_precond_dim)
+        else:
+            exp_avg_sq = state["exp_avg_sq"]
+
+        final = []
+        for ind, (m, o) in enumerate(zip(matrix, orth_matrix)):
+            if len(m) == 0:
+                final.append([])
+                continue
+            est_eig = torch.diag(o.T @ m @ o)
+            sort_idx = torch.argsort(est_eig, descending=True)
+            exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
+            o = o[:, sort_idx]
+            power_iter = m @ o
+            Q, _ = torch.linalg.qr(power_iter)
+
+            if not float_data:
+                Q = Q.to(original_device).type(original_type)
+            final.append(Q)
+
+        if merge_dims:
+            if self._data_format == "channels_last" and len(orig_shape) == 4:
+                exp_avg_sq = exp_avg_sq.reshape(permuted_shape).permute(0, 2, 3, 1)
+            else:
+                exp_avg_sq = exp_avg_sq.reshape(orig_shape)
+
+        state["exp_avg_sq"] = exp_avg_sq
+        return final
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -1,6 +1,8 @@
 """Module for tokenization utilities"""

 import logging
+import re
+from typing import Dict, List

 from termcolor import colored

@@ -66,47 +68,90 @@ def process_tokens_for_rl_debug(tokens, color, tokenizer, text_only):


 def check_rl_example_labels(example, tokenizer, text_only=False):
-    field_prompt, field_chosen, field_rejected, field_completion = (
-        "prompt",
-        "chosen",
-        "rejected",
-        "completion",
-    )
+    field_prompt, field_chosen, field_rejected = "prompt", "chosen", "rejected"

    input_tokens = example[field_prompt]
-
-    labels_chosen = example.get(field_chosen)
-    labels_rejected = example.get(field_rejected)
-    labels_completion = example.get(field_completion)
-
-    # Create a delimiter based on text_only flag
-    delimiter = "" if text_only else " "
+    labels_chosen, labels_rejected = example[field_chosen], example[field_rejected]

    # Process and color each type of token
    colored_tokens = process_tokens_for_rl_debug(
        input_tokens, "yellow", tokenizer, text_only
    )
+    colored_chosens = process_tokens_for_rl_debug(
+        labels_chosen, "green", tokenizer, text_only
+    )
+    colored_rejecteds = process_tokens_for_rl_debug(
+        labels_rejected, "red", tokenizer, text_only
+    )

-    # Process tokens
-    if labels_completion is None:
-        colored_chosens = process_tokens_for_rl_debug(
-            labels_chosen, "green", tokenizer, text_only
-        )
-        colored_rejecteds = process_tokens_for_rl_debug(
-            labels_rejected, "red", tokenizer, text_only
-        )
-    else:
-        colored_completion = process_tokens_for_rl_debug(
-            labels_completion, "green", tokenizer, text_only
-        )
+    # Create a delimiter based on text_only flag
+    delimiter = "" if text_only else " "

    # Logging information
    LOG.info(f"INPUT PROMPT: {delimiter.join(colored_tokens)}\n\n")
-
-    if labels_completion is None:
-        LOG.info(f"CHOSEN RESPONSE: {delimiter.join(colored_chosens)}\n\n")
-        LOG.info(f"REJECTED RESPONSE: {delimiter.join(colored_rejecteds)}\n\n\n")
-    else:
-        LOG.info(f"COMPLETION RESPONSE: {delimiter.join(colored_completion)}\n\n\n")
+    LOG.info(f"CHOSEN RESPONSE: {delimiter.join(colored_chosens)}\n\n")
+    LOG.info(f"REJECTED RESPONSE: {delimiter.join(colored_rejecteds)}\n\n\n")

    return delimiter.join(colored_tokens)
+
+
+GLAIVE_ROLES = ["USER", "ASSISTANT", "FUNCTION RESPONSE"]
+GLAIVE_TO_SHAREGPT_ROLE = {
+    "SYSTEM": "system",
+    "USER": "human",
+    "ASSISTANT": "gpt",
+    "FUNCTION RESPONSE": "tool",
+}
+
+GLAIVE_MSG_REGEX = re.compile(rf"({'|'.join(GLAIVE_ROLES)}): ")
+
+
+def chatml_to_conversation(row: Dict[str, str]) -> List[Dict[str, str]]:
+    """
+    Converts a ChatML formatted row to a list of messages in ShareGPT format.
+    Initially based off https://github.com/lilacai/lilac/blob/main/notebooks/GlaiveToShareGPT.ipynb.
+    """
+
+    system_prompt = row.get("system")
+    if system_prompt:
+        system_prompt = system_prompt.removeprefix("SYSTEM: ")
+
+    chat_str = row["chat"]
+    chat_msgs = [s.strip() for s in GLAIVE_MSG_REGEX.split(chat_str) if s]
+
+    chat_msg_dicts = [
+        {"from": GLAIVE_TO_SHAREGPT_ROLE[role], "value": value}
+        for role, value in zip(chat_msgs[::2], chat_msgs[1::2])
+    ]
+
+    if system_prompt:
+        chat_msg_dicts = [
+            {"from": GLAIVE_TO_SHAREGPT_ROLE["SYSTEM"], "value": system_prompt}
+        ] + chat_msg_dicts
+
+    return chat_msg_dicts
+
+
+def merge_consecutive_messages(messages):
+    """
+    Merge consecutive messages from the same sender into a single message.
+    This can be useful with datasets that contain multiple consecutive tool calls.
+    """
+
+    merged_messages = []
+    current_from = None
+    current_message = ""
+
+    for msg in messages:
+        if current_from == msg["from"]:
+            current_message += msg["value"]
+        else:
+            if current_from is not None:
+                merged_messages.append({"from": current_from, "value": current_message})
+            current_from = msg["from"]
+            current_message = msg["value"]
+
+    if current_from is not None:
+        merged_messages.append({"from": current_from, "value": current_message})
+
+    return merged_messages
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -17,7 +17,6 @@ from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.utils.distributed import reduce_and_broadcast
-from axolotl.utils.environment import check_cuda_p2p_ib_support
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

 LOG = get_logger("axolotl")
@@ -185,10 +184,11 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
        min_sequence_len=cfg.min_sample_len or 2,
    )

-    min_input_len = np.min(get_dataset_lengths(train_dataset))
-    LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
-    max_input_len = np.max(get_dataset_lengths(train_dataset))
-    LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
+    if cfg.is_preprocess:
+        min_input_len = np.min(get_dataset_lengths(train_dataset))
+        LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
+        max_input_len = np.max(get_dataset_lengths(train_dataset))
+        LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)

    if cfg.model_config_type == "mamba":
        LOG.info("dropping attention_mask column")
@@ -203,59 +203,37 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
        if eval_dataset and "token_type_ids" in eval_dataset.column_names:
            eval_dataset = eval_dataset.remove_columns("token_type_ids")

-    prior_len = len(train_dataset)
    train_dataset = train_dataset.filter(
        drop_long,
        num_proc=cfg.dataset_processes,
        load_from_cache_file=not cfg.is_preprocess,
        desc="Dropping Long Sequences",
    )
-    dropped = prior_len - len(train_dataset)
-    if dropped:
-        LOG.warning(f"Dropped {dropped} long samples from train dataset")
-
    if eval_dataset:
-        prior_len = len(eval_dataset)
        eval_dataset = eval_dataset.filter(
            drop_long,
            num_proc=cfg.dataset_processes,
            load_from_cache_file=not cfg.is_preprocess,
            desc="Dropping Long Sequences",
        )
-        dropped = prior_len - len(eval_dataset)
-        if dropped:
-            LOG.warning(f"Dropped {dropped} long samples from eval dataset")

    # drop samples with where the number of elements with labels not equal to -100 is zero
    def drop_no_trainable_tokens(sample):
        return np.sum(np.array(sample["labels"]) != -100) > 0

-    prior_len = len(train_dataset)
    train_dataset = train_dataset.filter(
        drop_no_trainable_tokens,
        num_proc=cfg.dataset_processes,
        load_from_cache_file=not cfg.is_preprocess,
        desc="Drop Samples with Zero Trainable Tokens",
    )
-    dropped = prior_len - len(train_dataset)
-    if dropped:
-        LOG.warning(
-            f"Dropped {dropped} samples with no trainable tokens from train dataset"
-        )
-
    if eval_dataset:
-        prior_len = len(eval_dataset)
        eval_dataset = eval_dataset.filter(
            drop_no_trainable_tokens,
            num_proc=cfg.dataset_processes,
            load_from_cache_file=not cfg.is_preprocess,
            desc="Drop Samples with Zero Trainable Tokens",
        )
-        dropped = prior_len - len(eval_dataset)
-        if dropped:
-            LOG.warning(
-                f"Dropped {dropped} samples with no trainable tokens from eval dataset"
-            )

    if cfg.group_by_length:
        train_dataset = train_dataset.map(
@@ -483,9 +461,6 @@ def setup_fsdp_envs(cfg):


 def prepare_optim_env(cfg):
-    if not check_cuda_p2p_ib_support():
-        if os.getenv("NCCL_P2P_DISABLE") is None:
-            os.environ["NCCL_P2P_DISABLE"] = "1"
    if cfg.fsdp:
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed:
@@ -515,7 +490,7 @@ def prepare_opinionated_env(cfg):
 def setup_trainer(
    cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
 ):
-    if cfg.rl in ("dpo", "ipo", "orpo", "kto", "simpo"):
+    if cfg.rl in ["dpo", "ipo", "orpo", "kto", "simpo"]:
        trainer_builder = HFRLTrainerBuilder(cfg, model[0], tokenizer, processor)
        trainer_builder.model_ref = model[1]
        trainer_builder.peft_config = model[2]
--- a/tests/e2e/conftest.py
+++ b/tests/e2e/conftest.py
@@ -1,35 +0,0 @@
-"""
-shared pytest fixtures
-"""
-import shutil
-import tempfile
-
-import pytest
-from huggingface_hub import snapshot_download
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_smollm2_135m_model():
-    # download the model
-    snapshot_download("HuggingFaceTB/SmolLM2-135M")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_tatsu_lab_alpaca_dataset():
-    # download the model
-    snapshot_download("tatsu-lab/alpaca", repo_type="dataset")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_mhenrichsen_alpaca_2k_dataset():
-    # download the model
-    snapshot_download("mhenrichsen/alpaca_2k_test", repo_type="dataset")
-
-
-@pytest.fixture
-def temp_dir():
-    # Create a temporary directory
-    _temp_dir = tempfile.mkdtemp()
-    yield _temp_dir
-    # Clean up the directory after the test
-    shutil.rmtree(_temp_dir)
--- a/tests/e2e/integrations/liger.py
+++ b/tests/e2e/integrations/liger.py
@@ -1,6 +1,7 @@
 """
 Simple end-to-end test for Liger integration
 """
+
 import unittest
 from pathlib import Path

--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -3,25 +3,28 @@ E2E tests for multigpu eval
 """
 import logging
 import os
+import unittest
 from pathlib import Path

 import yaml
 from accelerate.test_utils import execute_subprocess_async
-from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

+from ..utils import with_temp_dir
+
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"

 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


-class TestMultiGPUEval:
+class TestMultiGPUEval(unittest.TestCase):
    """
    Test case for MultiGPU Eval Sample Packing
    """

+    @with_temp_dir
    def test_eval_sample_packing(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
@@ -80,14 +83,13 @@ class TestMultiGPUEval:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

+    @with_temp_dir
    def test_eval(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
@@ -146,8 +148,6 @@ class TestMultiGPUEval:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -4,17 +4,17 @@ E2E tests for multigpu lora tinyllama

 import logging
 import os
+import unittest
 from pathlib import Path

 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
-from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

-from ..utils import is_hopper
+from ..utils import is_hopper, with_temp_dir

 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
@@ -25,19 +25,21 @@ AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
@pytest.fixture(scope="session", autouse=True)
 def download_model():
    # download the model
-    snapshot_download("HuggingFaceTB/SmolLM2-135M")
+    snapshot_download("TinyLlama/TinyLlama_v1.1")


-class TestMultiGPULlama:
+class TestMultiGPULlama(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

+    @with_temp_dir
    def test_lora_ddp(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
@@ -46,7 +48,9 @@ class TestMultiGPULlama:
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -77,23 +81,19 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

-    @pytest.mark.parametrize(
-        "gradient_accumulation_steps",
-        [1, 4],
-    )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    @with_temp_dir
+    def test_lora_ddp_packed(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 2048,
                "sample_packing": True,
                "eval_sample_packing": False,
@@ -105,7 +105,9 @@ class TestMultiGPULlama:
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -116,7 +118,7 @@ class TestMultiGPULlama:
                "num_epochs": 1,
                "max_steps": 15,
                "micro_batch_size": 4,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "gradient_accumulation_steps": 4,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
@@ -136,8 +138,6 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
@@ -145,11 +145,13 @@ class TestMultiGPULlama:
        )

    @pytest.mark.skipif(is_hopper(), reason="h100 doesn't support 8-bit lora")
+    @with_temp_dir
    def test_dpo_lora_ddp(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
@@ -162,10 +164,12 @@ class TestMultiGPULlama:
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "rl": "dpo",
-                "chat_template": "chatml",
+                "chat_template": "llama3",
                "datasets": [
                    {
                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
@@ -206,19 +210,18 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

+    @with_temp_dir
    def test_dpo_qlora_ddp(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "HuggingFaceTB/SmolLM-135M",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
@@ -275,94 +278,25 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

-    @pytest.mark.parametrize(
-        "gradient_accumulation_steps",
-        [1, 4],
-    )
-    def test_fsdp(self, temp_dir, gradient_accumulation_steps):
+    @with_temp_dir
+    def test_fsdp(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 10,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "FULL_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-            }
-        )
-
-        # write cfg to yaml file
-        Path(temp_dir).mkdir(parents=True, exist_ok=True)
-        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
-            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
-
-        execute_subprocess_async(
-            [
-                "accelerate",
-                "launch",
-                "--num-processes",
-                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
-            ]
-        )
-
-    @pytest.mark.parametrize(
-        "fsdp_state_dict_type",
-        ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
-    )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -390,7 +324,7 @@ class TestMultiGPULlama:
                    "fsdp_use_orig_params": False,
                    "fsdp_cpu_ram_efficient_loading": False,
                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
+                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
            }
@@ -407,19 +341,85 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

+    @with_temp_dir
+    def test_fsdp_packed(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
+                "sample_packing": True,
+                "eval_sample_packing": False,
+                "pad_to_sequence_len": True,
+                "sequence_len": 2048,
+                "val_set_size": 0.05,
+                "special_tokens": {
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
+                },
+                "datasets": [
+                    {
+                        "path": "tatsu-lab/alpaca",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 15,
+                "micro_batch_size": 4,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "fsdp": [
+                    "full_shard",
+                    "auto_wrap",
+                ],
+                "fsdp_config": {
+                    "fsdp_limit_all_gathers": True,
+                    "fsdp_offload_params": False,
+                    "fsdp_sync_module_states": True,
+                    "fsdp_use_orig_params": False,
+                    "fsdp_cpu_ram_efficient_loading": False,
+                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                },
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "accelerate",
+                "launch",
+                "--num-processes",
+                "2",
+                "-m",
+                "axolotl.cli.train",
+                str(Path(temp_dir) / "config.yaml"),
+            ]
+        )
+
+    @with_temp_dir
    def test_fsdp_qlora_prequant_packed(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                "base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16",
+                "tokenizer_type": "AutoTokenizer",
                "adapter": "qlora",
                "mean_resizing_embeddings": True,
                "load_in_4bit": True,
@@ -427,17 +427,17 @@ class TestMultiGPULlama:
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
+                "lora_modules_to_save": [
+                    "embed_tokens",
+                    "lm_head",
+                ],
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "pad_token": "</s>",
                },
                "datasets": [
                    {
@@ -483,29 +483,28 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

-    @pytest.mark.parametrize(
-        "gradient_accumulation_steps",
-        [1, 4],
-    )
-    def test_ds_zero3_packed(self, temp_dir, gradient_accumulation_steps):
+    @with_temp_dir
+    def test_ds_zero3_packed(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "sample_packing": True,
+                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -516,7 +515,7 @@ class TestMultiGPULlama:
                "num_epochs": 1,
                "max_steps": 15,
                "micro_batch_size": 4,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "gradient_accumulation_steps": 4,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
@@ -537,19 +536,19 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

+    @with_temp_dir
    def test_ds_zero3_qlora_packed(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "TinyLlama/TinyLlama_v1.1",
+                "tokenizer_type": "LlamaTokenizer",
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 8,
@@ -562,7 +561,9 @@ class TestMultiGPULlama:
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -594,8 +595,6 @@ class TestMultiGPULlama:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
--- a/tests/e2e/multigpu/test_qwen2.py
+++ b/tests/e2e/multigpu/test_qwen2.py
@@ -4,30 +4,31 @@ E2E tests for multigpu qwen2

 import logging
 import os
+import unittest
 from pathlib import Path

-import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
-from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

+from ..utils import with_temp_dir
+
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"


-class TestMultiGPUQwen2:
+class TestMultiGPUQwen2(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

-    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
-    def test_qlora_fsdp_dpo(self, base_model, temp_dir):
+    @with_temp_dir
+    def test_qlora_fsdp_dpo(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": base_model,
+                "base_model": "Qwen/Qwen2-1.5B",
                "load_in_4bit": True,
                "rl": "dpo",
                "chat_template": "chatml",
@@ -46,9 +47,9 @@ class TestMultiGPUQwen2:
                    },
                ],
                "num_epochs": 1,
-                "max_steps": 5,
+                "max_steps": 15,
                "warmup_steps": 20,
-                "micro_batch_size": 2,
+                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
@@ -90,8 +91,6 @@ class TestMultiGPUQwen2:
                "launch",
                "--num-processes",
                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -66,8 +66,6 @@ class TestFAXentropyLlama(unittest.TestCase):
                    },
                ],
                "num_epochs": 1,
-                "max_steps": 10,
-                "save_steps": 10,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -56,8 +56,6 @@ class TestLoraLlama(unittest.TestCase):
                    },
                ],
                "num_epochs": 2,
-                "max_steps": 20,
-                "save_steps": 10,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
@@ -111,7 +109,6 @@ class TestLoraLlama(unittest.TestCase):
                    },
                ],
                "num_epochs": 2,
-                "max_steps": 20,
                "save_steps": 0.5,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
--- a/tests/e2e/test_dpo.py
+++ b/tests/e2e/test_dpo.py
@@ -115,51 +115,6 @@ class TestDPOLlamaLora(unittest.TestCase):
        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "checkpoint-20/adapter_model.safetensors").exists()

-    @with_temp_dir
-    def test_dpo_use_weighting(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
-                "sequence_len": 1024,
-                "load_in_8bit": True,
-                "adapter": "lora",
-                "lora_r": 64,
-                "lora_alpha": 32,
-                "lora_dropout": 0.1,
-                "lora_target_linear": True,
-                "special_tokens": {},
-                "rl": "dpo",
-                "dpo_use_weighting": True,
-                "datasets": [
-                    {
-                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
-                        "type": "chatml.ultra",
-                        "split": "train",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "paged_adamw_8bit",
-                "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "warmup_steps": 5,
-                "gradient_checkpointing": True,
-                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-            }
-        )
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "checkpoint-20/adapter_model.safetensors").exists()
-
    @pytest.mark.skip("kto_pair no longer supported in trl")
    @with_temp_dir
    def test_kto_pair_lora(self, temp_dir):
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -1,66 +0,0 @@
-"""
-E2E tests for llama
-"""
-
-import logging
-import os
-import unittest
-from pathlib import Path
-
-from axolotl.cli import load_datasets
-from axolotl.common.cli import TrainerCliArgs
-from axolotl.train import train
-from axolotl.utils.config import normalize_config
-from axolotl.utils.dict import DictDefault
-
-from .utils import with_temp_dir
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestLlama(unittest.TestCase):
-    """
-    Test case for Llama models
-    """
-
-    @with_temp_dir
-    def test_fft_trust_remote_code(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
-                "trust_remote_code": True,
-                "sequence_len": 512,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 8,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-            }
-        )
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "model.safetensors").exists()
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -13,7 +13,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from .utils import require_torch_2_5_1, with_temp_dir
+from .utils import with_temp_dir

 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -67,13 +67,11 @@ class TestCustomOptimizers(unittest.TestCase):
        assert (Path(temp_dir) / "adapter_model.bin").exists()

    @with_temp_dir
-    @require_torch_2_5_1
-    def test_adopt_adamw(self, temp_dir):
+    def test_soap(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
+                "base_model": "HuggingFaceTB/SmolLM-135M",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -83,13 +81,11 @@ class TestCustomOptimizers(unittest.TestCase):
                "lora_target_linear": True,
                "val_set_size": 0.1,
                "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
+                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
+                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                    },
                ],
@@ -98,7 +94,9 @@ class TestCustomOptimizers(unittest.TestCase):
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
-                "optimizer": "adopt_adamw",
+                "optimizer": "soap",
+                "optim_soap_beta1": 0.95,
+                "optim_soap_beta2": 0.95,
                "lr_scheduler": "cosine",
            }
        )
@@ -108,37 +106,3 @@ class TestCustomOptimizers(unittest.TestCase):

        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.bin").exists()
-
-    @with_temp_dir
-    def test_fft_schedule_free_adamw(self, temp_dir):
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 1024,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "schedule_free_adamw",
-                "lr_scheduler": "constant",
-                "save_safetensors": True,
-            }
-        )
-        # pylint: disable=duplicate-code
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "model.safetensors").exists()
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -31,7 +31,7 @@ class TestPackedLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "HuggingFaceTB/SmolLM-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -1,85 +0,0 @@
-"""
-E2E tests for qwen
-"""
-
-import logging
-import os
-from pathlib import Path
-
-import pytest
-import yaml
-from accelerate.test_utils import execute_subprocess_async
-from transformers.testing_utils import get_torch_dist_unique_port
-
-from axolotl.utils.dict import DictDefault
-
-LOG = logging.getLogger("axolotl.tests.qwen")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestE2eQwen:
-    """
-    Test cases for qwen models
-    """
-
-    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
-    def test_dpo(self, base_model, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": base_model,
-                "rl": "dpo",
-                "chat_template": "qwen_25",
-                "sequence_len": 2048,
-                "val_set_size": 0.0,
-                "datasets": [
-                    {
-                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
-                        "split": "train",
-                        "type": "chat_template.default",
-                        "field_messages": "conversation",
-                        "field_chosen": "chosen",
-                        "field_rejected": "rejected",
-                        "message_field_role": "role",
-                        "message_field_content": "content",
-                        "roles": {
-                            "system": ["system"],
-                            "user": ["user"],
-                            "assistant": ["assistant"],
-                        },
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 5,
-                "warmup_steps": 20,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "bf16": "auto",
-                "tf32": True,
-                "gradient_checkpointing": True,
-            }
-        )
-
-        # write cfg to yaml file
-        Path(temp_dir).mkdir(parents=True, exist_ok=True)
-        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
-            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
-
-        execute_subprocess_async(
-            [
-                "accelerate",
-                "launch",
-                "--num-processes",
-                "2",
-                "--main_process_port",
-                f"{get_torch_dist_unique_port()}",
-                "-m",
-                "axolotl.cli.train",
-                str(Path(temp_dir) / "config.yaml"),
-            ]
-        )
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -6,13 +6,11 @@ import shutil
 import tempfile
 import unittest
 from functools import wraps
+from importlib.metadata import version
 from pathlib import Path

 import torch

-# from importlib.metadata import version
-from packaging import version
-

 def with_temp_dir(test_func):
    @wraps(test_func)
@@ -45,24 +43,12 @@ def require_torch_2_3_1(test_case):
    """

    def is_min_2_3_1():
-        torch_version = version.parse(torch.__version__)
-        return torch_version >= version.parse("2.3.1")
+        torch_version = version("torch")
+        return torch_version >= "2.3.1"

    return unittest.skipUnless(is_min_2_3_1(), "test torch 2.3.1")(test_case)


-def require_torch_2_5_1(test_case):
-    """
-    Decorator marking a test that requires torch >= 2.3.1
-    """
-
-    def is_min_2_5_1():
-        torch_version = version.parse(torch.__version__)
-        return torch_version >= version.parse("2.5.1")
-
-    return unittest.skipUnless(is_min_2_5_1(), "test torch 2.5.1")(test_case)
-
-
 def is_hopper():
    compute_capability = torch.cuda.get_device_capability()
    return compute_capability == (9, 0)
--- a/tests/integrations/liger.py
+++ b/tests/integrations/liger.py
@@ -1,80 +0,0 @@
-"""
-config validation tests for swiglu args
-"""
-# pylint: disable=duplicate-code
-import logging
-from typing import Optional
-
-import pytest
-
-from axolotl.utils.config import validate_config
-from axolotl.utils.dict import DictDefault
-
-
-@pytest.fixture(name="minimal_base_cfg")
-def fixture_cfg():
-    return DictDefault(
-        {
-            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
-            "learning_rate": 0.000001,
-            "datasets": [
-                {
-                    "path": "mhenrichsen/alpaca_2k_test",
-                    "type": "alpaca",
-                }
-            ],
-            "micro_batch_size": 1,
-            "gradient_accumulation_steps": 1,
-        }
-    )
-
-
-class BaseValidation:
-    """
-    Base validation module to setup the log capture
-    """
-
-    _caplog: Optional[pytest.LogCaptureFixture] = None
-
-    @pytest.fixture(autouse=True)
-    def inject_fixtures(self, caplog):
-        self._caplog = caplog
-
-
-# pylint: disable=too-many-public-methods
-class TestValidation(BaseValidation):
-    """
-    Test the validation module for liger
-    """
-
-    def test_deprecated_swiglu(self, minimal_cfg):
-        test_cfg = DictDefault(
-            {
-                "liger_swiglu": False,
-            }
-            | minimal_cfg
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            updated_cfg = validate_config(test_cfg)
-            assert (
-                "The 'liger_swiglu' argument is deprecated"
-                in self._caplog.records[0].message
-            )
-            assert updated_cfg.liger_swiglu is None
-            assert updated_cfg.liger_glu_activations is False
-
-    def test_conflict_swiglu_ligergluactivation(self, minimal_cfg):
-        test_cfg = DictDefault(
-            {
-                "liger_swiglu": False,
-                "liger_glu_activations": True,
-            }
-            | minimal_cfg
-        )
-
-        with pytest.raises(
-            ValueError,
-            match=r".*You cannot have both `liger_swiglu` and `liger_glu_activation` set.*",
-        ):
-            validate_config(test_cfg)
--- a/tests/prompt_strategies/test_sharegpt.py
+++ b/tests/prompt_strategies/test_sharegpt.py
@@ -0,0 +1,500 @@
+"""
+Test module for sharegpt integration w chatml
+"""
+
+import pytest
+from datasets import Dataset
+from tokenizers import AddedToken
+from transformers import AutoTokenizer
+
+from axolotl.datasets import TokenizedPromptDataset
+from axolotl.prompt_strategies.sharegpt import (
+    GlaiveShareGPTPromptTokenizingStrategy,
+    SimpleShareGPTPromptTokenizingStrategy,
+    register_chatml_template,
+    register_llama3_template,
+)
+from axolotl.prompters import ShareGPTPrompterV2
+
+register_chatml_template()
+register_llama3_template()
+
+
+@pytest.fixture(name="sharegpt_dataset")
+def fixture_sharegpt_dataset():
+    return Dataset.from_list(
+        [
+            {
+                "conversations": [
+                    {
+                        "from": "system",
+                        "value": "repeat",
+                    },
+                    {
+                        "from": "human",
+                        "value": "hello",
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "hello",
+                    },
+                    {
+                        "from": "human",
+                        "value": "goodbye",
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "goodbye",
+                    },
+                ]
+            }
+        ]
+    )
+
+
+@pytest.fixture(name="sharegpt_dataset_with_weights")
+def fixture_sharegpt_dataset_with_weights():
+    return Dataset.from_list(
+        [
+            {
+                "conversations": [
+                    {
+                        "from": "system",
+                        "value": "repeat",
+                    },
+                    {
+                        "from": "human",
+                        "value": "hello",
+                        "weight": 1,
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "hello",
+                        "weight": 0,
+                    },
+                    {
+                        "from": "human",
+                        "value": "rehello",
+                        "weight": 0,
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "rehello",
+                        "weight": 1,
+                    },
+                    {
+                        "from": "human",
+                        "value": "goodbye",
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "goodbye",
+                        "weight": 0,
+                    },
+                ]
+            }
+        ]
+    )
+
+
+@pytest.fixture(name="glaive_dataset")
+def fixture_sharegpt_glaive_dataset():
+    return Dataset.from_list(
+        [
+            {
+                "system": "SYSTEM: This is a system prompt",
+                "chat": "USER: Can you book a flight for me from New York to London? ASSISTANT: I'm sorry, but I don't have the capability to book flights.  <|endoftext|>",
+            }
+        ]
+    )
+
+
+@pytest.fixture(name="multi_role_dataset")
+def fixture_multi_role_dataset():
+    return Dataset.from_list(
+        [
+            {
+                "conversations": [
+                    {
+                        "from": "system",
+                        "value": "use get_weather(city) to get the weather for a city",
+                    },
+                    {
+                        "from": "human",
+                        "value": "hello, what's the weather in New York?",
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "let me get that for you",
+                    },
+                    {
+                        "from": "tool",
+                        "value": "get_weather(New York)",
+                    },
+                    {
+                        "from": "gpt",
+                        "value": "the weather in New York is 70 degrees and sunny",
+                    },
+                ]
+            }
+        ]
+    )
+
+
+@pytest.fixture(name="tokenizer")
+def fixture_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained(
+        "casperhansen/mistral-7b-instruct-v0.1-awq"
+    )
+    tokenizer.add_special_tokens(
+        {
+            "eos_token": AddedToken(
+                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
+            )
+        }
+    )
+    tokenizer.add_tokens(
+        [
+            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
+        ]
+    )
+
+    return tokenizer
+
+
+@pytest.fixture(name="llama3_tokenizer")
+def fixture_llama3_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")
+    tokenizer.eos_token = "<|eot_id|>"
+
+    return tokenizer
+
+
+class TestSharegptLlama3:
+    """Test class for ShareGPT style datasets with llama-3 prompts"""
+
+    def test_tokenization(self, sharegpt_dataset, llama3_tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="llama3",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            llama3_tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+
+        # fmt: off
+        # pylint: disable=duplicate-code
+        assert input_ids == [
+            128000,  # bos
+            128006, 9125, 128007,  # system header
+            271, 31724, 128009,  # sys prompt, eot
+            128006, 882, 128007,  # user header
+            271, 15339, 128009,  # user prompt eot
+            128006, 78191, 128007,  # assistant header
+            271, 15339, 128009,   # assistant response eot
+            128006, 882, 128007,
+            271, 19045, 29474, 128009,
+            128006, 78191, 128007,
+            271, 19045, 29474, 128009,
+        ]
+        # fmt: on
+
+    def test_tokenization_with_weights(
+        self, sharegpt_dataset_with_weights, llama3_tokenizer
+    ):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="llama3",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            llama3_tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset_with_weights, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+
+        # fmt: off
+        # pylint: disable=duplicate-code
+        assert input_ids == [
+            128000,  # bos
+            128006, 9125, 128007,  # system header
+            271, 31724, 128009,  # sys prompt, eot
+            128006, 882, 128007,  # user header
+            271, 15339, 128009,  # user prompt eot
+            128006, 78191, 128007,  # assistant header
+            271, 15339, 128009,   # assistant response eot
+            128006, 882, 128007,
+            271, 11310, 4896, 128009,
+            128006, 78191, 128007,
+            271, 11310, 4896, 128009,
+            128006, 882, 128007,
+            271, 19045, 29474, 128009,
+            128006, 78191, 128007,
+            271, 19045, 29474, 128009,
+        ]
+        # fmt: on
+
+
+class TestSharegptChatML:
+    """
+    Test class for sharegpt prompter
+    """
+
+    def test_no_double_im_end(self, sharegpt_dataset, tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+        # fmt: off
+        assert input_ids == [
+            #  28705, 13, is " \n"
+            1,   # bos
+            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
+            32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,   # human
+            32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
+        ]
+        # fmt: on
+
+    def test_no_double_im_end_with_weights(
+        self, sharegpt_dataset_with_weights, tokenizer
+    ):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset_with_weights, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+        # fmt: off
+        assert input_ids == [
+            #  28705, 13, is " \n"
+            1,   # bos
+            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
+            32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 267, 21558, 32000, 28705, 13,  # human
+            32001, 13892, 13, 267, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,   # human
+            32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
+        ]
+        # fmt: on
+
+    def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            -100,   # bos
+            -100, -100, -100, -100, -100, -100, -100,  # system
+            -100, -100, -100, -100, -100, -100, -100,  # human
+            -100, -100, 13, 21558, 32000, 28705, 13,  # gpt
+            -100, -100, -100, -100, -100, -100, -100, -100,   # human
+            -100, -100, 13, 12684, 17664, 32000, 28705, 13,  # gpt
+        ]
+        # fmt: on
+
+    def test_no_train_on_input_with_weights(
+        self, sharegpt_dataset_with_weights, tokenizer
+    ):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset_with_weights, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            -100,   # bos
+            -100, -100, -100, -100, -100, -100, -100,  # system
+            -100, -100, -100, -100, -100, -100, -100,  # human
+            -100, -100, -100, -100, -100, -100, -100,  # gpt with weight zero
+            -100, -100, -100, -100, -100, -100, -100, -100,  # human
+            -100, -100, 13, 267, 21558, 32000, 28705, 13,  # gpt
+            -100, -100, -100, -100, -100, -100, -100, -100,   # human
+            -100, -100, -100, -100, -100, -100, -100, -100  # gpt with weight zero
+        ]
+        # fmt: on
+
+    def test_w_train_on_input(self, sharegpt_dataset, tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            True,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            1,   # bos
+            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
+            32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,   # human
+            32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
+        ]
+        # fmt: on
+
+    def test_w_train_on_input_with_weights(
+        self, sharegpt_dataset_with_weights, tokenizer
+    ):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            True,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset_with_weights, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            1,   # bos
+            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
+            -100, -100, -100, -100, -100, -100, -100,  # gpt with weight 0
+            -100, -100, -100, -100, -100, -100, -100, -100,  # human with weight 0
+            32001, 13892, 13, 267, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
+            -100, -100, -100, -100, -100, -100, -100, -100  # gpt with weight 0
+        ]
+        # fmt: on
+
+    def test_chatml_glaive(self, glaive_dataset, tokenizer):
+        strategy = GlaiveShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            True,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, glaive_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            1,  # bos
+            32001, 1587, 13, 3260, 349, 264, 1587, 11510, 32000, 28705, 13,  # system
+            32001, 2188, 13, 6325, 368, 1820, 264, 9314, 354, 528, 477, 1450, 2726, 298, 4222, 28804, 32000, 28705, 13,  # human
+            32001, 13892, 13, 28737, 28742, 28719, 7371, 28725, 562, 315, 949, 28742, 28707, 506, 272, 21368, 298, 1820, 22447, 28723, 28705, 523, 28766, 416, 1009, 772, 28766, 28767, 32000, 28705, 13  # gpt
+        ]
+        # fmt: on
+
+    def test_multi_role_dataset(self, multi_role_dataset, tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(conversation="chatml", roles={"input": ["tool"]}),
+            tokenizer,
+            False,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, multi_role_dataset, process_count=1
+        )
+
+        input_ids = dataset_wrapper[0]["input_ids"]
+        # fmt: off
+        assert input_ids == [
+            1,   # bos
+            32001, 1587, 13, 1730, 625, 28730, 769, 1223, 28732, 18373, 28731, 298, 625, 272, 8086, 354, 264, 2990, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 28725, 767, 28742, 28713, 272, 8086, 297, 1450, 2726, 28804, 32000, 28705, 13,  # human
+            32001, 13892, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13,  # gpt
+            32001, 3921, 13, 527, 28730, 769, 1223, 28732, 2972, 2726, 28731, 32000, 28705, 13,  # tool
+            32001, 13892, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13  # gpt
+        ]
+        # fmt: on
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            -100,  # bos
+            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # system
+            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # human
+            -100, -100, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13,  # gpt
+            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # tool
+            -100, -100, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13  # gpt
+        ]
+        # fmt: on
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -306,10 +306,6 @@ class TestDatasetPreparation(unittest.TestCase):
        """Verify that processing data from the hub works with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"
-
-            # make sure prepared_path is empty
-            shutil.rmtree(prepared_path, ignore_errors=True)
-
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
@@ -371,79 +367,44 @@ class TestDatasetPreparation(unittest.TestCase):
    def test_load_local_hub_with_revision(self):
        """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
-                repo_id="mhenrichsen/alpaca_2k_test",
-                repo_type="dataset",
-                local_dir=tmp_ds_path,
-                revision="d05c1cb",
-            )
+            with tempfile.TemporaryDirectory() as tmp_dir2:
+                tmp_ds_path = Path(tmp_dir2) / "mhenrichsen/alpaca_2k_test"
+                tmp_ds_path.mkdir(parents=True, exist_ok=True)
+                snapshot_download(
+                    repo_id="mhenrichsen/alpaca_2k_test",
+                    repo_type="dataset",
+                    local_dir=tmp_ds_path,
+                    revision="d05c1cb",
+                )

-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": "mhenrichsen/alpaca_2k_test",
-                            "ds_type": "parquet",
-                            "type": "alpaca",
-                            "data_files": [
-                                f"{tmp_ds_path}/alpaca_2000.parquet",
-                            ],
-                            "revision": "d05c1cb",
-                        },
-                    ],
-                }
-            )
+                prepared_path = Path(tmp_dir) / "prepared"
+                cfg = DictDefault(
+                    {
+                        "tokenizer_config": "huggyllama/llama-7b",
+                        "sequence_len": 1024,
+                        "datasets": [
+                            {
+                                "path": "mhenrichsen/alpaca_2k_test",
+                                "ds_type": "parquet",
+                                "type": "alpaca",
+                                "data_files": [
+                                    f"{tmp_ds_path}/alpaca_2000.parquet",
+                                ],
+                                "revision": "d05c1cb",
+                            },
+                        ],
+                    }
+                )

-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
+                dataset, _ = load_tokenized_prepared_datasets(
+                    self.tokenizer, cfg, prepared_path
+                )

-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
-
-    def test_loading_local_dataset_folder(self):
-        """Verify that a dataset downloaded to a local folder can be loaded"""
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
-                repo_id="mhenrichsen/alpaca_2k_test",
-                repo_type="dataset",
-                local_dir=tmp_ds_path,
-            )
-
-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_path),
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
+                assert len(dataset) == 2000
+                assert "input_ids" in dataset.features
+                assert "attention_mask" in dataset.features
+                assert "labels" in dataset.features
+                shutil.rmtree(tmp_ds_path)


 if __name__ == "__main__":
--- a/tests/test_normalize_config.py
+++ b/tests/test_normalize_config.py
@@ -39,12 +39,12 @@ class NormalizeConfigTestCase(unittest.TestCase):
                "datasets": [
                    {
                        "path": "lorem/ipsum",
-                        "type": "chat_template",
-                        "chat_template": "gemma",
+                        "type": "sharegpt",
+                        "conversation": "vicuna_v1.1",
                    },
                    {
                        "path": "sit/amet",
-                        "type": "chat_template",
+                        "type": "sharegpt",
                    },
                ],
            }
@@ -52,8 +52,8 @@ class NormalizeConfigTestCase(unittest.TestCase):

        normalize_cfg_datasets(cfg)

-        assert cfg.datasets[0].chat_template == "gemma"
-        assert cfg.datasets[1].chat_template == "chatml"
+        assert cfg.datasets[0].conversation == "vicuna_v1.1"
+        assert cfg.datasets[1].conversation == "chatml"

    @patch("axolotl.utils.config.is_torch_bf16_gpu_available")
    def test_bf16_auto_setter_available(self, mock_bf16_avail):
--- a/tests/test_packed_pretraining.py
+++ b/tests/test_packed_pretraining.py
@@ -2,7 +2,6 @@
 import functools
 import unittest

-import pytest
 import torch
 from datasets import load_dataset
 from torch.utils.data import DataLoader
@@ -22,7 +21,6 @@ class TestPretrainingPacking(unittest.TestCase):
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.pad_token = "</s>"

-    @pytest.mark.flaky(retries=3, delay=5)
    def test_packing_stream_dataset(self):
        # pylint: disable=duplicate-code
        dataset = load_dataset(
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -3,6 +3,7 @@
 import json
 import logging
 import unittest
+from copy import deepcopy
 from pathlib import Path
 from typing import Optional

@@ -20,8 +21,12 @@ from axolotl.prompt_strategies.llama2_chat import (
    LLama2ChatTokenizingStrategy,
 )
 from axolotl.prompt_strategies.orpo.chat_template import load
-from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompt_strategies.sharegpt import GlaiveShareGPTPromptTokenizingStrategy
+from axolotl.prompt_tokenizers import (
+    AlpacaPromptTokenizingStrategy,
+    ShareGPTPromptTokenizingStrategy,
+)
+from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompterV2
 from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger("axolotl")
@@ -60,6 +65,17 @@ test_data = {
 }


+def prompt_strat(conversation, tokenizer):
+    "Helper function to create a prompt strategy for testing."
+    prompter = ShareGPTPrompterV2(conversation=conversation)
+    return ShareGPTPromptTokenizingStrategy(
+        prompter,
+        tokenizer,
+        False,
+        2048,
+    )
+
+
 class TestPromptTokenizationStrategies(unittest.TestCase):
    """
    Test class for prompt tokenization strategies.
@@ -82,6 +98,196 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
            }
        )

+    def test_sharegpt_integration(self):
+        with open(
+            Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8"
+        ) as fin:
+            data = fin.read()
+            conversation = json.loads(data)
+        with open(
+            Path(__file__).parent / "fixtures/conversation.tokenized.json",
+            encoding="utf-8",
+        ) as fin:
+            data = fin.read()
+            tokenized_conversation = json.loads(data)
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        example = strat.tokenize_prompt(conversation)
+        for fields in ["input_ids", "attention_mask", "labels"]:
+            self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
+            self.assertEqual(example[fields], tokenized_conversation[fields])
+
+    def test_sharegpt_warnings_integration(self):
+        with open(
+            Path(__file__).parent / "fixtures/conversation.missingturns.json",
+            encoding="utf-8",
+        ) as fin:
+            data = fin.read()
+            conversation = json.loads(data)
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            strat.tokenize_prompt(conversation)
+            assert "assistant turn has empty text" in self._caplog.records[1].message
+
+    def test_sharegpt_warnings_turns(self):
+        conversation = {
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},
+                {"from": "human", "value": "dolor"},
+                {"from": "human", "value": "dolor"},
+                {"from": "gpt", "value": "sit"},
+            ]
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            strat.tokenize_prompt(conversation)
+            assert (
+                "Role did not alternate between turns (gpt and human)"
+                in self._caplog.records[0].message
+            )
+
+    def test_sharegpt_llama(self):
+        "Make sure the sharegpt/llama is tokenized and formatted correctly."
+        strat = prompt_strat("llama-2", self.tokenizer)
+
+        def tokenize(conv):
+            return strat.tokenize_prompt(deepcopy(conv))["input_ids"]
+
+        def decode(ids):
+            return strat.tokenizer.decode(ids)
+
+        # fmt: off
+        # System message, multi-turn conversations
+        mt_ids = tokenize(test_data['multi_turn_sys'])
+        assert decode(mt_ids) == '<s> [INST] <<SYS>>\nlorem\n<</SYS>>\n\nabc [/INST] ipsum</s><s> [INST] 123 [/INST] sit</s>'
+        assert mt_ids == [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 29880, 3668, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10736, 518, 29914, 25580, 29962, 23421, 2, 1, 518, 25580, 29962, 29871, 29896, 29906, 29941, 518, 29914, 25580, 29962, 7845, 2]
+
+        # System message, single-turn conversations
+        st_ids = tokenize(test_data['single_turn_sys'])
+        assert decode(st_ids) == '<s> [INST] <<SYS>>\nlorem\n<</SYS>>\n\nabc [/INST] ipsum</s>'
+        assert st_ids == [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 29880, 3668, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10736, 518, 29914, 25580, 29962, 23421, 2]
+
+        # No system message, single-turn
+        ns_ids = tokenize(test_data['single_turn_no_sys'])
+        assert decode(ns_ids) == '<s> [INST] abc [/INST] ipsum</s>'
+        assert ns_ids == [1, 518, 25580, 29962, 25638, 518, 29914, 25580, 29962, 23421, 2]
+
+        # No system message, multi-turn
+        ns_mt_ids = tokenize(test_data['multi_turn_no_sys'])
+        assert decode(ns_mt_ids) == '<s> [INST] abc [/INST] ipsum</s><s> [INST] 123 [/INST] sit</s>'
+        assert ns_mt_ids == [1, 518, 25580, 29962, 25638, 518, 29914, 25580, 29962, 23421, 2, 1, 518, 25580, 29962, 29871, 29896, 29906, 29941, 518, 29914, 25580, 29962, 7845, 2]
+        # fmt: on
+
+    def test_sharegpt_mistral(self):
+        "Make sure the sharegpt/mistral is tokenized and formatted correctly."
+        strat = prompt_strat("mistral", self.tokenizer)
+
+        def tokenize(conv):
+            return strat.tokenize_prompt(deepcopy(conv))["input_ids"]
+
+        def decode(ids):
+            return strat.tokenizer.decode(ids)
+
+        # fmt: off
+        # System message, multi-turn conversations
+        mt_ids = tokenize(test_data['multi_turn_sys'])
+        assert decode(mt_ids) == '<s> [INST]  lorem\nabc [/INST] ipsum</s> [INST] 123 [/INST] sit</s>'
+        assert mt_ids == [1, 518, 25580, 29962, 29871, 301, 3668, 13, 10736, 518, 29914, 25580, 29962, 23421, 2, 518, 25580, 29962, 29871, 29896, 29906, 29941, 518, 29914, 25580, 29962, 7845, 2]
+
+        # System message, single-turn conversations
+        st_ids = tokenize(test_data['single_turn_sys'])
+        assert decode(st_ids) == '<s> [INST]  lorem\nabc [/INST] ipsum</s>'
+        assert st_ids == [1, 518, 25580, 29962, 29871, 301, 3668, 13, 10736, 518, 29914, 25580, 29962, 23421, 2]
+
+        # No system message, single-turn
+        ns_ids = tokenize(test_data['single_turn_no_sys'])
+        assert decode(ns_ids) == '<s> [INST] abc [/INST] ipsum</s>'
+        assert ns_ids == [1, 518, 25580, 29962, 25638, 518, 29914, 25580, 29962, 23421, 2]
+
+        # No system message, multi-turn
+        ns_mt_ids = tokenize(test_data['multi_turn_no_sys'])
+        assert decode(ns_mt_ids) == '<s> [INST] abc [/INST] ipsum</s> [INST] 123 [/INST] sit</s>'
+        assert ns_mt_ids == [1, 518, 25580, 29962, 25638, 518, 29914, 25580, 29962, 23421, 2, 518, 25580, 29962, 29871, 29896, 29906, 29941, 518, 29914, 25580, 29962, 7845, 2]
+        # fmt: on
+
+    def test_sharegpt_changes_roles(self):
+        conversation = {
+            "roles": ["USER", "CHARACTER"],
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},
+                {"from": "human", "value": "dolor"},
+                {"from": "gpt", "value": "sit"},
+            ],
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            res = strat.tokenize_prompt(conversation)
+            assert "CHARACTER" in self.tokenizer.decode(res["input_ids"])
+
+    def test_sharegpt_assistant_label_ignore(self):
+        conversation = {
+            "roles": ["user", "assistant"],
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},
+                {"from": "human", "value": "dolor"},
+                {"from": "gpt", "value": "sit"},
+            ],
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            res = strat.tokenize_prompt(conversation)
+            idx = res["input_ids"].index(20255)  # assistant token
+            assert res["labels"][idx] == -100
+
+    def test_glaive_tool_label_ignore(self):
+        conversation = {
+            "system": "SYSTEM: This is a system prompt",
+            "chat": "USER: Can you book a flight for me from New York to London? ASSISTANT: I'm sorry, but I don't have the capability to book flights.  <|endoftext|>",
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = GlaiveShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            res = strat.tokenize_prompt(conversation)
+            idx = res["input_ids"].index(13566)  # assistant token
+            assert res["labels"][idx] == -100
+
    def test_no_sys_prompt(self):
        """
        tests the interface between the user and assistant parts
--- a/tests/test_schedulers.py
+++ b/tests/test_schedulers.py
@@ -32,19 +32,16 @@ class TestCosineConstantLr(unittest.TestCase):
    def test_schedulers(self):
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], 0)
        for _ in range(self.warmup_steps):
-            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], self._lr)
        constant_step = int(self.train_steps * self.constant_lr_ratio)
        remaining_step = self.train_steps - constant_step
        for _ in range(constant_step):
-            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
        )
        for _ in range(remaining_step):
-            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -646,6 +646,39 @@ class TestValidation(BaseValidation):

        validate_config(cfg)

+    def test_sharegpt_deprecation(self, minimal_cfg):
+        cfg = (
+            DictDefault(
+                {"datasets": [{"path": "lorem/ipsum", "type": "sharegpt:chat"}]}
+            )
+            | minimal_cfg
+        )
+        with self._caplog.at_level(logging.WARNING):
+            new_cfg = validate_config(cfg)
+            assert any(
+                "`type: sharegpt:chat` will soon be deprecated." in record.message
+                for record in self._caplog.records
+            )
+        assert new_cfg.datasets[0].type == "sharegpt"
+
+        cfg = (
+            DictDefault(
+                {
+                    "datasets": [
+                        {"path": "lorem/ipsum", "type": "sharegpt_simple:load_role"}
+                    ]
+                }
+            )
+            | minimal_cfg
+        )
+        with self._caplog.at_level(logging.WARNING):
+            new_cfg = validate_config(cfg)
+            assert any(
+                "`type: sharegpt_simple` will soon be deprecated." in record.message
+                for record in self._caplog.records
+            )
+        assert new_cfg.datasets[0].type == "sharegpt:load_role"
+
    def test_no_conflict_save_strategy(self, minimal_cfg):
        cfg = (
            DictDefault(
@@ -726,7 +759,7 @@ class TestValidation(BaseValidation):
        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "epoch",
+                    "evaluation_strategy": "epoch",
                    "eval_steps": 10,
                }
            )
@@ -734,14 +767,14 @@ class TestValidation(BaseValidation):
        )

        with pytest.raises(
-            ValueError, match=r".*eval_strategy and eval_steps mismatch.*"
+            ValueError, match=r".*evaluation_strategy and eval_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "no",
+                    "evaluation_strategy": "no",
                    "eval_steps": 10,
                }
            )
@@ -749,14 +782,14 @@ class TestValidation(BaseValidation):
        )

        with pytest.raises(
-            ValueError, match=r".*eval_strategy and eval_steps mismatch.*"
+            ValueError, match=r".*evaluation_strategy and eval_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "steps",
+                    "evaluation_strategy": "steps",
                }
            )
            | minimal_cfg
@@ -767,7 +800,7 @@ class TestValidation(BaseValidation):
        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "steps",
+                    "evaluation_strategy": "steps",
                    "eval_steps": 10,
                }
            )
@@ -790,7 +823,7 @@ class TestValidation(BaseValidation):
        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "no",
+                    "evaluation_strategy": "no",
                }
            )
            | minimal_cfg
@@ -801,7 +834,7 @@ class TestValidation(BaseValidation):
        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "epoch",
+                    "evaluation_strategy": "epoch",
                    "val_set_size": 0,
                }
            )
@@ -810,7 +843,7 @@ class TestValidation(BaseValidation):

        with pytest.raises(
            ValueError,
-            match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*",
+            match=r".*eval_steps and evaluation_strategy are not supported with val_set_size == 0.*",
        ):
            validate_config(cfg)

@@ -826,7 +859,7 @@ class TestValidation(BaseValidation):

        with pytest.raises(
            ValueError,
-            match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*",
+            match=r".*eval_steps and evaluation_strategy are not supported with val_set_size == 0.*",
        ):
            validate_config(cfg)

@@ -856,7 +889,7 @@ class TestValidation(BaseValidation):
        cfg = (
            DictDefault(
                {
-                    "eval_strategy": "epoch",
+                    "evaluation_strategy": "epoch",
                    "val_set_size": 0.01,
                }
            )
@@ -1095,24 +1128,6 @@ class TestValidation(BaseValidation):
            assert new_cfg["dpo_beta"] is None
            assert len(self._caplog.records) == 1

-    def test_eval_strategy_remap(self, minimal_cfg):
-        cfg = (
-            DictDefault(
-                {
-                    "evaluation_strategy": "steps",
-                }
-            )
-            | minimal_cfg
-        )
-
-        with self._caplog.at_level(logging.WARNING):
-            new_cfg = validate_config(cfg)
-            assert new_cfg.eval_strategy == "steps"
-            assert (
-                "evaluation_strategy is deprecated, use eval_strategy instead"
-                in self._caplog.records[0].message
-            )
-

 class TestValidationCheckModelConfig(BaseValidation):
    """
--- a/tests/test_validation_dataset.py
+++ b/tests/test_validation_dataset.py
@@ -48,8 +48,9 @@ class TestValidationCheckDatasetConfig(BaseValidation):
            | {
                "datasets": [
                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
+                        "path": "LDJnr/Puffin",
+                        "type": "sharegpt",
+                        "conversation": "chatml",
                        "shards": 10,
                    }
                ]
@@ -61,6 +62,7 @@ class TestValidationCheckDatasetConfig(BaseValidation):
        def _check_config():
            assert checked_cfg.datasets[0].path == cfg.datasets[0].path
            assert checked_cfg.datasets[0].type == cfg.datasets[0].type
+            assert checked_cfg.datasets[0].conversation == cfg.datasets[0].conversation
            assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards

        _check_config()
@@ -234,59 +236,3 @@ class TestValidationCheckDatasetConfig(BaseValidation):
        )

        _check_config()
-
-    def test_dataset_sharegpt_deprecation(self, minimal_cfg):
-        cfg = DictDefault(
-            minimal_cfg
-            | {
-                "chat_template": "chatml",
-                "datasets": [
-                    {
-                        "path": "LDJnr/Puffin",
-                        "type": "sharegpt",
-                        "conversation": "chatml",
-                    }
-                ],
-            }
-        )
-
-        # Check sharegpt deprecation is raised
-        with pytest.raises(ValueError, match=r".*type: sharegpt.*` is deprecated.*"):
-            validate_config(cfg)
-
-        # Check that deprecation is not thrown for non-str type
-        cfg = DictDefault(
-            minimal_cfg
-            | {
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": {
-                            "field_instruction": "instruction",
-                            "field_output": "output",
-                            "field_system": "system",
-                            "format": "<|user|> {instruction} {input} <|model|>",
-                            "no_input_format": "<|user|> {instruction} <|model|>",
-                            "system_prompt": "",
-                        },
-                    }
-                ],
-            }
-        )
-
-        validate_config(cfg)
-
-        # Check that deprecation is not thrown for non-sharegpt type
-        cfg = DictDefault(
-            minimal_cfg
-            | {
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    }
-                ],
-            }
-        )
-
-        validate_config(cfg)
Author	SHA1	Message	Date
Wing Lian	efa1209a92	add smoke test training	2024-10-30 15:40:27 -04:00
Wing Lian	67b9e31bbc	make sure to set alternate optimizer and set lr and eps from adam	2024-10-30 15:33:37 -04:00
Wing Lian	ad60916323	add soap optimizer support	2024-10-30 15:33:37 -04:00