Compare commits

Comparing `soap-optim...liger-dpo` (131 commits)
Commits in this comparison, SHA1 only (the Author and Date columns were not captured in this export):

96af760e08, cfa80dace0, 0a661980ca, effc4dc409, 02629c7cdf, 78a4aa86d6, d009ead101, 6aa31b44c6, 9001859b0b, 34d3c8dcfb,
ab4b32187d, 5d6b088997, 3862267040, c78de6f214, b1e8286c57, 40907c6887, 6a342feda2, 0c25bc07a2, 343a4d8855, 393853751e,
1302e31049, be5f554a62, 22319182ab, 440aab8a6f, 5bef19064b, 743ba62bd5, f9a7748bd8, 5e9fa33f3d, 08fa133177, 6b3058b2dc,
5726141c4e, 2f3ebbc44f, fc973f4322, e399ba533e, 4baf8e5e96, d7d2fd366e, e2882dd749, a1790f2652, 418ad2b586, d87df2c776,
1ef70312ba, 81ef3e45f7, bd8436bc6e, fc6188cd76, b9bb02406a, ff4794cd8e, 822c904092, d5f58b6509, 9f6d0b5587, 53963c792c,
a4f4a56d77, ce5bcff750, b620ed94d0, 5f1d98e8fc, 1cf7075d18, f4cabc2351, 6e0fb4a6b2, 724b660d56, 51c9e1a035, 45c0825587,
94fc223f6c, 151abb7a67, bf416bdfd0, 838b74d05b, 2e99bb303e, 68a26f1005, db51a9e4cb, 8961364bc9, e9c3a2aec0, 02ca3f93b0,
5f6f9186e4, 6679e20f47, ec59d4cb83, a77c8a71cf, 775311f98f, f007c38e49, d9b71edf84, c07bd2fa65, ed079d434a, 8403c67156,
9871fa060b, 70cf79ef52, c06b8f0243, 0c8b1d824a, fd70eec577, d42f202046, 0dabde1962, 15f1462ccd, 521e62daf1, c16ec398d7,
2f20cb7ebf, 71d4030b79, f3a5d119af, ba219b51a5, 5be8e13d35, 2d7830fda6, 5e98cdddac, 1d7aee0ad2, 659ee5d723, 342935cff3,
c5eb9ea2c2, f2145a3ccb, 010d0e7ff3, 01881c3113, 0e8eb96e07, 4e1891b12b, 28924fc791, 8c480b2804, a4b1cc6df0, 7b78a31593,
810ebc2c0e, ad435a3b09, 9f1cf9b17c, 3931a42763, dc8f9059f7, 234e94e9dd, f68fb71005, 9bc3ee6c75, d356740ffa, e4af51eb66,
e20b15bee3, d4796cb645, fd3b80716a, 3265b7095e, 3cb2d75de1, 035e9f9dd7, 02ce520b7e, 052a9a79b4, 3591bcfaf9, dc1de7d81b,
d4dbfa02fe
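For readers who want to reproduce a comparison like this locally, a minimal sketch (assuming an `origin` remote that carries both branches named in the header):

```bash
# List the commits reachable from liger-dpo but not from soap-optim,
# one line per commit (short SHA plus subject):
git fetch origin soap-optim liger-dpo
git log --oneline origin/soap-optim..origin/liger-dpo

# Count them; this should match the "131 commits" figure above.
git rev-list --count origin/soap-optim..origin/liger-dpo
```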
.github/workflows/base.yml (24 changed lines, vendored)

```diff
@@ -1,6 +1,16 @@
 name: ci-cd-base

 on:
   push:
     branches:
       - "main"
+    paths:
+      - 'Dockerfile-base'
+      - '.github/workflows/base.yml'
+  pull_request:
+    paths:
+      - 'Dockerfile-base'
+      - '.github/workflows/base.yml'
+  workflow_dispatch:

 jobs:
@@ -27,7 +37,7 @@ jobs:
       - cuda: "124"
         cuda_version: 12.4.1
         cudnn_version: ""
-        python_version: "3.11"
+        python_version: "3.10"
         pytorch: 2.4.1
         torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
       - cuda: "124"
@@ -40,23 +50,25 @@ jobs:
         cuda_version: 12.4.1
         cudnn_version: ""
         python_version: "3.11"
-        pytorch: 2.5.0
+        pytorch: 2.5.1
         torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Docker metadata
         id: metadata
-        uses: docker/metadata-action@v3
+        uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl-base
+          images: |
+            winglian/axolotl-base
+            axolotlai/axolotl-base
       - name: Login to Docker Hub
         uses: docker/login-action@v2
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
       - name: Build
         uses: docker/build-push-action@v4
         with:
```
.github/workflows/docs.yml (2 changed lines, vendored)

```diff
@@ -17,7 +17,7 @@ jobs:
       - name: Set up Quarto
         uses: quarto-dev/quarto-actions/setup@v2
       - name: Setup Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v5
         with:
           python-version: '3.10'
       - name: install dependencies
```
.github/workflows/lint.yml (6 changed lines, vendored)

```diff
@@ -15,9 +15,9 @@ jobs:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
```
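The same checks this workflow runs in CI can be run locally before pushing; a sketch, assuming the repo's `.pre-commit-config.yaml` is present:

```bash
# Install the hook runner and execute every configured hook against all
# files, mirroring what pre-commit/action@v3.0.1 does in CI:
pip install pre-commit
pre-commit install          # optional: also run the hooks on each commit
pre-commit run --all-files
```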
.github/workflows/main.yml (43 changed lines, vendored)

```diff
@@ -4,11 +4,13 @@ on:
   push:
     branches:
       - "main"
+    tags:
+      - "v*"
   workflow_dispatch:

 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -32,7 +34,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -42,7 +44,12 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl
+          images: |
+            winglian/axolotl
+            axolotlai/axolotl
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
       - name: Login to Docker Hub
@@ -56,7 +63,7 @@ jobs:
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_ARGS=${{ matrix.axolotl_args }}
@@ -70,7 +77,7 @@ jobs:

   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -94,7 +101,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -104,20 +111,25 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl-cloud
+          images: |
+            winglian/axolotl-cloud
+            axolotlai/axolotl-cloud
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
       - name: Build
         uses: docker/build-push-action@v5
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
           file: ./docker/Dockerfile-cloud
           push: ${{ github.event_name != 'pull_request' }}
@@ -128,7 +140,7 @@ jobs:

   build-axolotl-cloud-no-tmux:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -146,20 +158,25 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl-cloud-term
+          images: |
+            winglian/axolotl-cloud-term
+            axolotlai/axolotl-cloud-term
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
       - name: Build
         uses: docker/build-push-action@v5
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
           file: ./docker/Dockerfile-cloud-no-tmux
           push: ${{ github.event_name != 'pull_request' }}
```
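The condition fix above (removing the stray `]` from `[skip docker]]`) means the skip token now matches as written. A hedged sketch of how a contributor might use it:

```bash
# A commit whose message contains the literal token "[skip docker]" will
# skip the build-axolotl* jobs on push (the commit message is an example):
git commit -m "docs: fix a typo in the README [skip docker]"
git push origin main
```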
.github/workflows/multi-gpu-e2e.yml (9 changed lines, vendored)

```diff
@@ -8,9 +8,14 @@ on:
   schedule:
     - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -31,7 +36,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           axolotl_extras:
           num_gpus: 2
           nightly_build: "true"
```
.github/workflows/nightlies.yml (18 changed lines, vendored)

```diff
@@ -7,7 +7,7 @@ on:

 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -31,7 +31,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -41,7 +41,9 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl
+          images: |
+            winglian/axolotl
+            axolotlai/axolotl
           tags: |
             type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
       - name: Set up Docker Buildx
@@ -69,7 +71,7 @@ jobs:

   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -93,7 +95,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -103,7 +105,9 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: winglian/axolotl-cloud
+          images: |
+            winglian/axolotl-cloud
+            axolotlai/axolotl-cloud
           tags: |
             type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
       - name: Login to Docker Hub
@@ -112,7 +116,7 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
       - name: Build
         uses: docker/build-push-action@v5
         with:
```
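The nightly images are tagged `{{ branch }}-{{ date 'YYYYMMDD' }}`, and after this change they are pushed to the `axolotlai` org as well. A sketch of pulling one (the date below is only an example of the pattern):

```bash
# Pull a nightly image; substitute the actual build date for 20250101.
docker pull axolotlai/axolotl:main-20250101
docker pull axolotlai/axolotl-cloud:main-20250101
```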
.github/workflows/pypi.yml (27 changed lines, vendored)

```diff
@@ -3,12 +3,27 @@ name: publish pypi
 on:
   push:
     tags:
-      - '*'
+      - 'v*'
   workflow_dispatch:

 jobs:
+  setup_release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Create release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: gh release create "$GITHUB_REF_NAME" --generate-notes
+
   pypi-publish:
     name: Upload release to PyPI
     runs-on: ubuntu-latest
+    needs: [setup_release]
     environment:
       name: pypi
       url: https://pypi.org/p/axolotl
@@ -16,17 +31,17 @@ jobs:
       id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: "3.10"

       - name: Install dependencies
         run: |
           pip3 install wheel packaging
-          pip3 install -e .
+          pip3 install --no-build-isolation -e .
           pip3 install -r requirements-dev.txt -r requirements-tests.txt

       - name: Extract tag name
@@ -37,9 +52,9 @@ jobs:
         run: |
           sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py

-      - name: Build a binary wheel
+      - name: Build a source dist
         run: |
-          python setup.py sdist bdist_wheel
+          python setup.py sdist

       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
```
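Both jobs are now driven by pushing a `v*` tag: `setup_release` creates the GitHub release with generated notes, then `pypi-publish` uploads via trusted publishing. A sketch of the release flow (the version number is an example):

```bash
# Pushing a "v*" tag triggers setup_release and then pypi-publish;
# no PyPI API token is needed thanks to trusted publishing (id-token: write).
git tag v0.6.0
git push origin v0.6.0
```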
.github/workflows/tests-nightly.yml (48 changed lines, vendored)

```diff
@@ -9,12 +9,12 @@ jobs:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -23,21 +23,32 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
+      max-parallel: 2
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
+        exclude:
+          - python_version: "3.10"
+            pytorch_version: "2.4.1"
+          - python_version: "3.10"
+            pytorch_version: "2.5.1"
     timeout-minutes: 20

     steps:
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python_version }}
           cache: 'pip' # caching pip dependencies

+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging setuptools wheel
+
       - name: Install PyTorch
         run: |
           pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
@@ -48,17 +59,29 @@ jobs:
           sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
           sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
           sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
+          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

       - name: Install dependencies
         run: |
           pip3 install --upgrade pip
           pip3 install --upgrade packaging
-          pip3 install -U -e .
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
           pip3 install -r requirements-dev.txt -r requirements-tests.txt

+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
       - name: Run tests
         run: |
-          pytest --ignore=tests/e2e/ tests/
+          pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest tests/patched/

       - name: cleanup pip cache
         run: |
@@ -82,13 +105,6 @@ jobs:
           num_gpus: 1
           axolotl_extras: mamba-ssm
           nightly_build: "true"
-        - cuda: 121
-          cuda_version: 12.1.1
-          python_version: "3.11"
-          pytorch: 2.3.1
-          num_gpus: 1
-          axolotl_extras: mamba-ssm
-          nightly_build: "true"
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
@@ -99,7 +115,7 @@ jobs:
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.0
+          pytorch: 2.5.1
           num_gpus: 1
           axolotl_extras:
           nightly_build: "true"
```
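`pip install --no-build-isolation`, which appears throughout this diff, builds packages against the already-installed environment instead of a temporary isolated one. A sketch of why that matters here (versions are examples):

```bash
# Extensions such as flash-attn and mamba-ssm compile against whatever torch
# is already installed, so torch must be visible at build time; pip's default
# build isolation would hide it and the build would fail or link against the
# wrong version. Install torch first, then build in-place:
pip3 install torch==2.5.1
pip3 install --no-build-isolation -U -e .
python -c "import torch; assert '2.5.1' in torch.__version__"  # clobber check
```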
.github/workflows/tests.yml (170 changed lines, vendored)

```diff
@@ -8,24 +8,35 @@ on:
       - '**.py'
       - 'requirements.txt'
+      - '.github/workflows/*.yml'
+      - 'requirements-tests.txt'
+      - 'cicd/cicd.sh'
+      - 'cicd/Dockerfile.jinja'
   pull_request:
     paths:
       - '**.py'
       - 'requirements.txt'
+      - '.github/workflows/*.yml'
+      - 'requirements-tests.txt'
+      - 'cicd/cicd.sh'
+      - 'cicd/Dockerfile.jinja'
   workflow_dispatch:

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   pre-commit:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -34,17 +45,23 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
+      max-parallel: 2
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
+        exclude:
+          - python_version: "3.10"
+            pytorch_version: "2.4.1"
+          - python_version: "3.10"
+            pytorch_version: "2.5.1"
     timeout-minutes: 20

     steps:
       - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

       - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python_version }}
           cache: 'pip' # caching pip dependencies
@@ -61,52 +78,147 @@ jobs:
       - name: Install dependencies
         run: |
           pip3 show torch
-          pip3 install -U -e .
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
           pip3 install -r requirements-dev.txt -r requirements-tests.txt

+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
       - name: Run tests
         run: |
-          pytest --ignore=tests/e2e/ tests/
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest -v tests/patched/

       - name: cleanup pip cache
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+  pytest-sdist:
+    name: PyTest from Source Dist
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      max-parallel: 1
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.4.1", "2.5.1"]
+    timeout-minutes: 20
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging setuptools setuptools_scm build wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          python -m build --no-isolation --sdist
+          pip3 install --no-build-isolation dist/axolotl*.tar.gz
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Run tests
+        run: |
+          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
+          pytest -v tests/patched/
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+  docker-e2e-tests-1st:
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 90
-    needs: [pre-commit, pytest]
+    needs: [pre-commit, pytest, pytest-sdist]

     strategy:
       fail-fast: false
       matrix:
         include:
           - cuda: 121
             cuda_version: 12.1.1
             python_version: "3.10"
             pytorch: 2.3.1
             num_gpus: 1
             axolotl_extras: mamba-ssm
           - cuda: 121
             cuda_version: 12.1.1
             python_version: "3.11"
             pytorch: 2.3.1
             num_gpus: 1
             axolotl_extras: mamba-ssm
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.4.1
             num_gpus: 1
             axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.5.0
             num_gpus: 1
             axolotl_extras:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
       - name: Install Python
         uses: actions/setup-python@v5
         with:
           python-version: "3.10"
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
           pip install modal==0.63.64 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
           echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
           echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
           modal run cicd.tests
+
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 121
+            cuda_version: 12.1.1
+            python_version: "3.10"
+            pytorch: 2.3.1
+            num_gpus: 1
+            axolotl_extras: mamba-ssm
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            num_gpus: 1
+            axolotl_extras:
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
```
.gitignore (3 changed lines, vendored)

```diff
@@ -182,3 +182,6 @@ submit.sh
 typings/
 out/
+
+# vim
+*.swp
```
MANIFEST.in (5 changed lines, new file)

```diff
@@ -0,0 +1,5 @@
+include requirements.txt
+include README.md
+include LICENSE
+include src/setuptools_axolotl_dynamic_dependencies.py
+recursive-include axolotl *.py
```
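MANIFEST.in controls which non-package files land in the source distribution that the pypi workflow now ships. A quick local check that the new manifest behaves as intended (a sketch, run from the repo root):

```bash
# Build an sdist and list its contents; requirements.txt, README.md, LICENSE
# and the dynamic-dependencies helper should all appear in the archive.
pip install --upgrade build
python -m build --sdist
tar -tzf dist/axolotl-*.tar.gz | head -n 20
```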
README.md (326 changed lines)

````diff
@@ -1,8 +1,25 @@
-# Axolotl
+<p align="center">
+    <picture>
+        <source media="(prefers-color-scheme: dark)" srcset="image/axolotl_logo_digital_white.svg">
+        <source media="(prefers-color-scheme: light)" srcset="image/axolotl_logo_digital_black.svg">
+        <img alt="Axolotl" src="image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
+    </picture>
+</p>
+
+<p align="center">
+    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
+    <br/>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
+    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
+    <br/>
+    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
+    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
+    <br/>
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+</p>

 Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.

@@ -28,9 +45,13 @@ Features:
 ## Table of Contents
 - [Axolotl](#axolotl)
   - [Table of Contents](#table-of-contents)
-  - [Axolotl supports](#axolotl-supports)
   - [Quickstart ⚡](#quickstart-)
-    - [Usage](#usage)
+    - [Edge Builds](#edge-builds-)
+    - [Axolotl CLI Usage](#axolotl-cli-usage)
+  - [Badge ❤🏷️](#badge-️)
+  - [Contributing 🤝](#contributing-)
+  - [Sponsors 🤝❤](#sponsors-)
+  - [Axolotl supports](#axolotl-supports)
   - [Advanced Setup](#advanced-setup)
     - [Environment](#environment)
       - [Docker](#docker)
@@ -62,20 +83,12 @@ Features:
     - [Tokenization Mismatch b/w Inference \& Training](#tokenization-mismatch-bw-inference--training)
   - [Debugging Axolotl](#debugging-axolotl)
   - [Need help? 🙋](#need-help-)
-  - [Badge ❤🏷️](#badge-️)
   - [Community Showcase](#community-showcase)
-  - [Contributing 🤝](#contributing-)
-  - [Sponsors 🤝❤](#sponsors-)
-    - [💎 Diamond Sponsors - Contact directly](#-diamond-sponsors---contact-directly)
-    - [🥇 Gold Sponsors - $5000/mo](#-gold-sponsors---5000mo)
-    - [🥈 Silver Sponsors - $1000/mo](#-silver-sponsors---1000mo)
-    - [🥉 Bronze Sponsors - $500/mo](#-bronze-sponsors---500mo)

 </td>
 <td>

 <div align="center">
-  <img src="image/axolotl.png" alt="axolotl" width="160">
+  <img src="image/axolotl_symbol_digital_white.svg" alt="axolotl" width="160">
   <div>
     <p>
       <b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
@@ -92,6 +105,148 @@ Features:
   </tr>
 </table>

+## Quickstart ⚡
+
+Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
+
+**Requirements**: *Nvidia* GPU (Ampere architecture or newer for `bf16` and Flash Attention) or *AMD* GPU, Python >=3.10 and PyTorch >=2.3.1.
+
+```bash
+pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+
+# download examples and optionally deepspeed configs to the local path
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # OPTIONAL
+
+# finetune using lora
+axolotl train examples/llama-3/lora-1b.yml
+```
+
+### Edge Builds 🏎️
+
+If you're looking for the latest features and updates between releases, you'll need to install
+from source.
+
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+pip3 install packaging ninja
+pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+```
+
+### Axolotl CLI Usage
+
+We now support a new, more streamlined CLI using [click](https://click.palletsprojects.com/en/stable/).
+
+```bash
+# preprocess datasets - optional but recommended
+CUDA_VISIBLE_DEVICES="0" axolotl preprocess examples/llama-3/lora-1b.yml
+
+# finetune lora
+axolotl train examples/llama-3/lora-1b.yml
+
+# inference
+axolotl inference examples/llama-3/lora-1b.yml \
+    --lora-model-dir="./outputs/lora-out"
+
+# gradio
+axolotl inference examples/llama-3/lora-1b.yml \
+    --lora-model-dir="./outputs/lora-out" --gradio
+
+# remote yaml files - the yaml config can be hosted on a public URL
+# Note: the yaml config must directly link to the **raw** yaml
+axolotl train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml
+```
+
+We've also added a new command for fetching `examples` and `deepspeed_configs` to your
+local machine. This will come in handy when installing `axolotl` from PyPI.
+
+```bash
+# Fetch example YAML files (stores in "examples/" folder)
+axolotl fetch examples
+
+# Fetch deepspeed config files (stores in "deepspeed_configs/" folder)
+axolotl fetch deepspeed_configs
+
+# Optionally, specify a destination folder
+axolotl fetch examples --dest path/to/folder
+```
+
+### Legacy Usage
+<details>
+
+<summary>Click to Expand</summary>
+
+While the Axolotl CLI is the preferred method for interacting with axolotl, we
+still support the legacy `-m axolotl.cli.*` usage.
+
+```bash
+# preprocess datasets - optional but recommended
+CUDA_VISIBLE_DEVICES="0" python -m axolotl.cli.preprocess examples/llama-3/lora-1b.yml
+
+# finetune lora
+accelerate launch -m axolotl.cli.train examples/llama-3/lora-1b.yml
+
+# inference
+accelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \
+    --lora_model_dir="./outputs/lora-out"
+
+# gradio
+accelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \
+    --lora_model_dir="./outputs/lora-out" --gradio
+
+# remote yaml files - the yaml config can be hosted on a public URL
+# Note: the yaml config must directly link to the **raw** yaml
+accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml
+```
+
+</details>
+
+## Badge ❤🏷️
+
+Building something cool with Axolotl? Consider adding a badge to your model card.
+
+```markdown
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+```
+
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+
+## Sponsors 🤝❤
+
+If you love axolotl, consider sponsoring the project by reaching out directly to [wing@axolotl.ai](mailto:wing@axolotl.ai).
+
+---
+
+- [Modal](https://modal.com/) Modal lets you run data/AI jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune LLM models, run protein folding simulations, and much more.
+
+---
+
+## Contributing 🤝
+
+Please read the [contributing guide](./.github/CONTRIBUTING.md)
+
+Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
+
+PRs are **greatly welcome**!
+
+Please run the quickstart instructions followed by the below to setup env:
+```bash
+pip3 install -r requirements-dev.txt -r requirements-tests.txt
+pre-commit install
+
+# test
+pytest tests/
+
+# optional: run against all files
+pre-commit run --all-files
+```
+
+Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
+
+<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
+</a>
+
+## Axolotl supports

 |             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
@@ -117,41 +272,6 @@ Features:
 ❌: not supported
 ❓: untested

-## Quickstart ⚡
-
-Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
-
-**Requirements**: Nvidia GPU (Ampere architecture or newer for `bf16` and Flash Attention), Python >=3.10 and PyTorch >=2.3.1.
-
-```bash
-git clone https://github.com/axolotl-ai-cloud/axolotl
-cd axolotl
-
-pip3 install packaging ninja
-pip3 install -e '.[flash-attn,deepspeed]'
-```
-
-### Usage
-```bash
-# preprocess datasets - optional but recommended
-CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/openllama-3b/lora.yml
-
-# finetune lora
-accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
-
-# inference
-accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
-    --lora_model_dir="./outputs/lora-out"
-
-# gradio
-accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
-    --lora_model_dir="./outputs/lora-out" --gradio
-
-# remote yaml files - the yaml config can be hosted on a public URL
-# Note: the yaml config must directly link to the **raw** yaml
-accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/openllama-3b/lora.yml
-```
-
 ## Advanced Setup

 ### Environment
@@ -159,7 +279,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
 #### Docker

 ```bash
-docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
+docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
 ```

 Or run on the current files for development:
@@ -178,7 +298,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
 A more powerful Docker command to run would be this:

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-latest
 ```

 It additionally:
@@ -200,7 +320,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
   3. Install Axolotl along with python dependencies
        ```bash
        pip3 install packaging
-       pip3 install -e '.[flash-attn,deepspeed]'
+       pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
        ```
   4. (Optional) Login to Huggingface to use gated models/datasets.
        ```bash
@@ -210,7 +330,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --

 #### Cloud GPU

-For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)
+For cloud GPU providers that support docker images, use [`axolotlai/axolotl-cloud:main-latest`](https://hub.docker.com/r/axolotlai/axolotl-cloud/tags)

 - on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
 - on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
@@ -279,7 +399,7 @@ Please use WSL or Docker!

 Use the below instead of the install method in QuickStart.
 ```
-pip3 install -e '.'
+pip3 install --no-build-isolation -e '.'
 ```
 More info: [mac.md](/docs/mac.qmd)
@@ -319,7 +439,7 @@ Write a job description in YAML as below:
 # dstack.yaml
 type: task

-image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
+image: axolotlai/axolotl-cloud:main-latest

 env:
   - HUGGING_FACE_HUB_TOKEN
@@ -383,11 +503,10 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
   - typescript
     type: ... # unimplemented custom format

-# fastchat conversation (deprecation soon, use chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template)
-# See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+# chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template
   - path: ...
-    type: sharegpt
-    conversation: chatml # default: vicuna_v1.1
+    type: chat_template
+    chat_template: chatml # defaults to tokenizer's chat_template

 # local
   - path: data.jsonl # or json
@@ -562,7 +681,8 @@ plugins:
   - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
-liger_swiglu: true
+liger_glu_activation: true
+liger_layer_norm: true
 liger_fused_linear_cross_entropy: true
 ```
@@ -669,86 +789,6 @@ See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, al

 ## Need help? 🙋

-Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we our community members can help you.
+Join our [Discord server](https://discord.gg/HhrNrHJPRb) where our community members can help you.

-Need dedicated support? Please contact us at [✉️wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org) for dedicated support options.
-
-## Badge ❤🏷️
-
-Building something cool with Axolotl? Consider adding a badge to your model card.
-
-```markdown
-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
-```
-
-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
-
 ## Community Showcase

 Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.

 Open Access AI Collective
 - [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b-fixed)
 - [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
 - [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)

 PocketDoc Labs
 - [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA)

-## Contributing 🤝
-
-Please read the [contributing guide](./.github/CONTRIBUTING.md)
-
-Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
-
-PRs are **greatly welcome**!
-
-Please run the quickstart instructions followed by the below to setup env:
-```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
-pre-commit install
-
-# test
-pytest tests/
-
-# optional: run against all files
-pre-commit run --all-files
-```
-
-Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
-
-<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
-</a>
-
-## Sponsors 🤝❤
-
-OpenAccess AI Collective is run by volunteer contributors such as [winglian](https://github.com/winglian),
-[NanoCode012](https://github.com/NanoCode012), [tmm1](https://github.com/tmm1),
-[mhenrichsen](https://github.com/mhenrichsen), [casper-hansen](https://github.com/casper-hansen),
-[hamelsmu](https://github.com/hamelsmu) and many more who help us accelerate forward by fixing bugs, answering
-community questions and implementing new features. Axolotl needs donations from sponsors for the compute needed to
-run our unit & integration tests, troubleshooting community issues, and providing bounties. If you love axolotl,
-consider sponsoring the project via [GitHub Sponsors](https://github.com/sponsors/OpenAccess-AI-Collective),
-[Ko-fi](https://ko-fi.com/axolotl_ai) or reach out directly to
-[wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org).
-
----
-
-#### 💎 Diamond Sponsors - [Contact directly](mailto:wing@openaccessaicollective.org)
-
----
-
-#### 🥇 Gold Sponsors - $5000/mo
-
----
-
-#### 🥈 Silver Sponsors - $1000/mo
-
----
-
-#### 🥉 Bronze Sponsors - $500/mo
-
-- [JarvisLabs.ai](https://jarvislabs.ai)
-
----
+
+Need dedicated support? Please contact us at [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for dedicated support options.
````
cicd/Dockerfile.jinja (file header not captured in this export; name inferred from the Jinja placeholders and the workflow paths above)

```diff
@@ -1,10 +1,9 @@
-FROM winglian/axolotl-base:{{ BASE_TAG }}
+FROM axolotlai/axolotl-base:{{ BASE_TAG }}

 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
 ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
 ENV CUDA="{{ CUDA }}"
-ENV BNB_CUDA_VERSION="{{ CUDA }}"
 ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
 ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
@@ -28,14 +27,18 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
         sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
         sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
         sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi

 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
     fi

+RUN python scripts/unsloth_install.py | sh
+RUN python scripts/cutcrossentropy_install.py | sh
+
 # So we can test the Docker image
 RUN pip install -r requirements-dev.txt -r requirements-tests.txt
```
cicd/cicd.sh (10 changed lines)

```diff
@@ -1,6 +1,10 @@
 #!/bin/bash
 set -e

-pytest -n4 --ignore=tests/e2e/ /workspace/axolotl/tests/
-pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
-pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
+python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
+
+pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
+# pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
+pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/patched/
+pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/integrations/
+pytest -v --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
```
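The reworked script leans on pytest-xdist; a sketch of what the key flags do (the path assumes the CI container layout):

```bash
# -n8 runs tests across 8 worker processes; --dist loadfile keeps every test
# from a given file on the same worker, which avoids cross-process clashes
# for tests that share per-file fixtures or GPU state. -n1 --dist loadfile
# serializes a suite while keeping the same grouping semantics.
pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ /workspace/axolotl/tests/
```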
Modal CI driver for the multi-GPU suite (file header not captured in this export; this file invokes `./cicd/multigpu.sh`)

```diff
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import Image, Stub
+from modal import App, Image

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -46,7 +46,7 @@ cicd_image = (
     .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-stub = Stub("Axolotl CI/CD", secrets=[])
+app = App("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 2))
@@ -61,7 +61,7 @@ def run_cmd(cmd: str, run_folder: str):
     exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@stub.function(
+@app.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
     timeout=60 * 60,
@@ -72,6 +72,6 @@ def cicd_pytest():
     run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")


-@stub.local_entrypoint()
+@app.local_entrypoint()
 def main():
     cicd_pytest.remote()
```
cicd/multigpu.sh (file header not captured in this export; name inferred from the driver above)

```diff
@@ -2,4 +2,4 @@
 set -e

 # only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
```
Modal CI driver for the single-GPU e2e suite (file header not captured in this export; this file invokes `./cicd/cicd.sh`)

```diff
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import Image, Stub
+from modal import App, Image

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -40,6 +40,7 @@ with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
 cicd_image = (
     Image.from_dockerfile(
         pathlib.Path(temp_dir) / "Dockerfile",
+        context_mount=None,
         force_build=True,
         gpu="A10G",
     )
@@ -47,7 +48,7 @@ cicd_image = (
     .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-stub = Stub("Axolotl CI/CD", secrets=[])
+app = App("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 1))
@@ -62,7 +63,7 @@ def run_cmd(cmd: str, run_folder: str):
     exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@stub.function(
+@app.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
     timeout=60 * 60,
@@ -73,6 +74,6 @@ def cicd_pytest():
     run_cmd("./cicd/cicd.sh", "/workspace/axolotl")


-@stub.local_entrypoint()
+@app.local_entrypoint()
 def main():
     cicd_pytest.remote()
```
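After the `Stub` to `App` rename (the newer Modal client API), the entrypoint is still invoked the same way from CI; a sketch, assuming a configured Modal account and token:

```bash
# `modal run cicd.tests` imports the driver module, builds the image from the
# rendered Dockerfile, and executes the @app.local_entrypoint, which calls
# cicd_pytest.remote() on a GPU worker:
pip install "modal==0.63.64" jinja2
modal run cicd.tests
```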
Example debug config (file header not captured in this export)

```diff
@@ -1,4 +1,4 @@
-# Example config for debugging the sharegpt prompt format
+# Example config for debugging the chat_template prompt format
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
```
docker/Dockerfile (file header not captured in this export; name inferred from content)

```diff
@@ -1,11 +1,10 @@
 ARG BASE_TAG=main-base
-FROM winglian/axolotl-base:$BASE_TAG
+FROM axolotlai/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
-ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.1.2"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -21,11 +20,14 @@ WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
     fi

+RUN python scripts/unsloth_install.py | sh
+RUN python scripts/cutcrossentropy_install.py | sh
+
 # So we can test the Docker image
 RUN pip install pytest
```
docker/Dockerfile-base (file header not captured in this export; name inferred from content)

```diff
@@ -16,7 +16,7 @@ ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
     && wget \
         https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
     && mkdir /root/.conda \
@@ -29,7 +29,9 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

 RUN git lfs install --skip-repo && \
     pip3 install awscli && \
```
docker/Dockerfile-cloud (file header not captured in this export; name inferred from content)

```diff
@@ -1,8 +1,8 @@
 ARG BASE_TAG=main
-FROM winglian/axolotl:$BASE_TAG
+FROM axolotlai/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
 ENV HF_HOME="/workspace/data/huggingface-cache/hub"
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"
```
An identical hunk in a second file, presumably docker/Dockerfile-cloud-no-tmux (file header not captured in this export)

```diff
@@ -1,8 +1,8 @@
 ARG BASE_TAG=main
-FROM winglian/axolotl:$BASE_TAG
+FROM axolotlai/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
 ENV HF_HOME="/workspace/data/huggingface-cache/hub"
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"
```
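The cloud images route every Hugging Face cache variable to `/workspace/data/huggingface-cache`; a sketch of persisting that cache across container restarts (the host-side path is an assumption):

```bash
# Mount a host directory over the in-container cache root so downloaded
# models and datasets survive container restarts:
docker run --gpus '"all"' --rm -it \
  -v "${HOME}/hf-cache":/workspace/data/huggingface-cache \
  axolotlai/axolotl-cloud:main-latest
```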
A Dockerfile variant that builds axolotl from a git ref (file header not captured in this export; consistent with docker/Dockerfile-tests)

```diff
@@ -1,11 +1,10 @@
 ARG BASE_TAG=main-base
-FROM winglian/axolotl-base:$BASE_TAG
+FROM axolotlai/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
-ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.1.2"
 ARG GITHUB_REF="main"
@@ -25,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image
```
AMD/ROCm installation doc (file header not captured in this export; content matches the ROCm setup guide in docs/)

````diff
@@ -52,7 +52,7 @@ export GPU_ARCHS="gfx90a"
 cd flash-attention
 export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
 patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
-pip install .
+pip install --no-build-isolation .
 ```

 ### 6. Install Axolotl
@@ -63,7 +63,7 @@ Clone and install Axolotl:
 git clone https://github.com/axolotl-ai-cloud/axolotl
 cd axolotl
 pip install packaging ninja
-pip install -e .
+pip install --no-build-isolation -e .
 ```

 ### 7. Apply xformers Workaround
````
@@ -83,7 +83,7 @@ lora_on_cpu: true
datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
    data_files: # Optional[str] path to source data files
@@ -91,15 +91,7 @@ datasets:
    name: # Optional[str] name of dataset configuration to load
    train_on_split: train # Optional[str] name of dataset split to load from
    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.

    # Optional[str] fastchat conversation type, only used with type: sharegpt
    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
    field_human: # Optional[str]. Human key to use for conversation.
    field_model: # Optional[str]. Assistant key to use for conversation.
    # Add additional keys from your dataset as input or output roles
    roles:
      input: # Optional[List[str]]. These will be masked based on train_on_input
      output: # Optional[List[str]].
    trust_remote_code: # Optional[bool] Trust remote code for untrusted source

  # Custom user instruction prompt
  - path: repo
@@ -170,6 +162,9 @@ datasets:
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true

# Deduplicates datasets and test_datasets with identical entries.
dataset_exact_deduplication: true

# A list of one or more datasets to eval the model with.
# You can use either test_datasets, or val_set_size, but not both.
test_datasets:
@@ -183,6 +178,8 @@ test_datasets:

# use RL training: 'dpo', 'ipo', 'kto'
rl:
# whether to perform weighting if doing DPO training. Boolean.
dpo_use_weighting:

# The name of the chat template to use for training, following values are supported:
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
@@ -412,6 +409,7 @@ lr_div_factor: # Learning rate div factor
# - adamw_torch_fused
# - adamw_torch_xla
# - adamw_apex_fused
# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
# - adafactor
# - adamw_anyprecision
# - sgd
@@ -6,33 +6,8 @@ order: 3

## sharegpt

UPDATE: ShareGPT is being deprecated in the next release. Please see `chat_template` section below.
IMPORTANT: ShareGPT is deprecated! Please see the `chat_template` section below.

conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)

```{.json filename="data.jsonl"}
{"conversations": [{"from": "...", "value": "..."}]}
```

Note: `type: sharegpt` opens special configs:
- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
- `field_human`: specify the key to use instead of `human` in the conversation.
- `field_model`: specify the key to use instead of `gpt` in the conversation.

```yaml
datasets:
  path: ...
  type: sharegpt

  conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
  field_human: # Optional[str]. Human key to use for conversation.
  field_model: # Optional[str]. Assistant key to use for conversation.
  # Add additional keys from your dataset as input or output roles
  roles:
    input: # Optional[List[str]]. These will be masked based on train_on_input
    output: # Optional[List[str]].
```

## pygmalion

@@ -40,38 +15,6 @@ datasets:
{"conversations": [{"role": "...", "value": "..."}]}
```

## sharegpt.load_role

conversations where `role` is used instead of `from`

```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```

## sharegpt.load_guanaco

conversations where `from` is `prompter`/`assistant` instead of the default sharegpt keys

```{.json filename="data.jsonl"}
{"conversations": [{"from": "...", "value": "..."}]}
```

## sharegpt.load_ultrachat

conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.

```{.json filename="data.jsonl"}
{"messages": [{"user": "...", "assistant": "..."}]}
```

## sharegpt_jokes

creates a chat where the bot is asked to tell a joke, then explain why the joke is funny

```{.json filename="data.jsonl"}
{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
```

## chat_template
@@ -71,7 +71,7 @@ Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/us

```bash
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```

#### Remote Hosts
@@ -185,7 +185,7 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3

## Debugging With Docker

Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.
Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.

### Setup

@@ -202,17 +202,17 @@ cd axolotl
Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
```

>[!Tip]
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/winglian/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

You will now be in the container. Next, perform an editable install of Axolotl:

```bash
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```

### Attach To Container
@@ -52,6 +52,26 @@ datasets:
    type: chat_template.argilla
```

#### KTO

```yaml
rl: kto
rl_beta: 0.5
kto_desirable_weight: 0.2

remove_unused_columns: false

datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    type: llama3.ultra
    split: train

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
```

#### Using local dataset files
```yaml
datasets:
@@ -11,12 +11,10 @@ standard industry baselines.

### Installation

The following will install unsloth from source and downgrade xformers, as unsloth is incompatible with the most up-to-date libraries.
The following will install the correct unsloth and extras from source.

```bash
pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps --force-reinstall xformers==0.0.26.post1
python scripts/unsloth_install.py | sh
```

### Using unsloth with Axolotl
@@ -2,19 +2,15 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AKjdG7tbTb-n"
},
"metadata": {},
"source": [
"# Example notebook for running Axolotl on google colab"
"## Setting up"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RcbNpOgWRcii"
},
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
@@ -22,82 +18,76 @@
"assert (torch.cuda.is_available()==True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"metadata": {},
"outputs": [],
"source": [
"!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""
"!pip install --no-build-isolation axolotl[deepspeed]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BW2MFr7HTjub"
},
"metadata": {},
"source": [
"## Create a yaml config file"
"## Hugging Face login (optional)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9pkF2dSoQEUN"
},
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"notebook_login()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"base_model: NousResearch/Meta-Llama-3.1-8B\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" - path: tatsu-lab/alpaca\n",
" type: alpaca\n",
"dataset_prepared_path:\n",
"dataset_prepared_path: last_run_prepared\n",
"val_set_size: 0.05\n",
"output_dir: ./outputs/qlora-out\n",
"output_dir: ./outputs/lora-out\n",
"\n",
"sequence_len: 2048\n",
"sample_packing: true\n",
"eval_sample_packing: true\n",
"pad_to_sequence_len: true\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 4096\n",
"sample_packing: true\n",
"eval_sample_packing: false\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"lora_modules_to_save:\n",
" - embed_tokens\n",
" - lm_head\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
@@ -105,12 +95,12 @@
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"gradient_accumulation_steps: 4\n",
"micro_batch_size: 2\n",
"num_epochs: 4\n",
"optimizer: paged_adamw_32bit\n",
"gradient_accumulation_steps: 2\n",
"micro_batch_size: 1\n",
"num_epochs: 1\n",
"optimizer: paged_adamw_8bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"learning_rate: 2e-5\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
@@ -121,13 +111,15 @@
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: true\n",
"flash_attention: false\n",
"sdp_attention: true\n",
"\n",
"warmup_steps: 10\n",
"evals_per_epoch: 4\n",
"warmup_steps: 1\n",
"max_steps: 25\n",
"evals_per_epoch: 1\n",
"eval_table_size:\n",
"saves_per_epoch: 1\n",
"debug:\n",
"deepspeed:\n",
@@ -135,9 +127,10 @@
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
"\n",
" pad_token: <|end_of_text|>\n",
"\"\"\"\n",
"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
"\n",
@@ -146,31 +139,124 @@
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)\n"
" yaml.dump(yaml_dict, file)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bidoj8YLTusD"
},
"metadata": {},
"source": [
"## Launch the training"
"Above we have a configuration file with the base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on a HuggingFace repo or the local machine.\n",
"\n",
"The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:\n",
"\n",
"* \"base_model\": String value, specifies the underlying pre-trained LLM that will be used for finetuning\n",
"\n",
"Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n",
"\n",
"* \"load_in_8bit\": Boolean value, whether to quantize the model weights into 8-bit integer.\n",
"\n",
"* \"load_in_4bit\": Boolean value, whether to quantize the model weights into 4-bit integer.\n",
"\n",
"* \"strict\": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n",
"\n",
"* \"datasets\": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n",
"\n",
"* \"val_set_size\": Either a float value less than one or an integer less than the total size of the dataset. Sets the size of the validation set taken from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of the validation set.\n",
"\n",
"* \"output_dir\": String value. Path where the trained model is saved.\n",
"\n",
"For data preprocessing:\n",
"\n",
"* \"sequence_len\": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n",
"\n",
"* \"pad_to_sequence_len\": Boolean. Whether to pad the input to the maximum sequence length.\n",
"\n",
"* \"sample_packing\": Boolean. Specifies whether to use multi-packing with block diagonal attention.\n",
"\n",
"* \"special_tokens\": Python dict, optional. Allows users to specify additional special tokens for the tokenizer.\n",
"\n",
"For LoRA configuration and its hyperparameters:\n",
"\n",
"* \"adapter\": String. Either \"lora\" or \"qlora\", depending on the user's choice.\n",
"\n",
"* \"lora_model_dir\": String, Optional. Path to directory that contains a LoRA model, if there is already a trained LoRA model the user would like to use.\n",
"\n",
"* \"lora_r\": Integer. Refers to the rank of the LoRA decomposition matrices. A higher value will reduce LoRA efficiency. Recommended to be set to 8.\n",
"\n",
"* \"lora_alpha\": Integer. Scales the weight matrices by $\\\\frac{\\\\text{lora_alpha}}{\\\\text{lora_r}}$. Recommended to be fixed at 16.\n",
"\n",
"* \"lora_dropout\": Float between 0 and 1. The dropout probability of a LoRA layer.\n",
"\n",
"* \"lora_target_linear\": Boolean. If true, lora will target all linear modules in the transformers architecture.\n",
"\n",
"* \"lora_modules_to_save\": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n",
"\n",
"See [LoRA](https://arxiv.org/abs/2106.09685) for a detailed explanation of the LoRA implementation.\n",
"\n",
"For the training configurations:\n",
"\n",
"* \"gradient_accumulation_steps\": Integer. The number of steps over which to accumulate gradients for batch training. E.g. if 2, the optimizer takes a step every two micro-batches.\n",
"\n",
"* \"micro_batch_size\": Integer. Batch size per GPU, i.e. batch size / gradient_accumulation_steps\n",
"\n",
"* \"num_epochs\": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole dataset once.\n",
"\n",
"* \"optimizer\": The optimizer to use for the training.\n",
"\n",
"* \"learning_rate\": The learning rate.\n",
"\n",
"* \"lr_scheduler\": The learning rate scheduler to use for adjusting learning rate during training.\n",
"\n",
"* \"train_on_inputs\": Boolean. Whether to include the user's prompt in the training labels or mask it out.\n",
"\n",
"* \"group_by_length\": Boolean. Whether to group similarly sized data to minimize padding.\n",
"\n",
"* \"bf16\": Either \"auto\", \"true\", or \"false\". Whether to use the CUDA bf16 floating point format. If set to \"auto\", bf16 will be applied automatically if the GPU supports it.\n",
"\n",
"* \"fp16\": Optional. Specifies whether to use CUDA fp16. Automatically set to true if \"bf16\" is set to true. Otherwise false.\n",
"\n",
"* \"tf32\": Boolean. Whether to use CUDA tf32. Will override bf16.\n",
"\n",
"* \"gradient_checkpointing\": Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n",
"\n",
"* \"gradient_checkpointing_kwargs\": Python Dict. Fed into the trainer.\n",
"\n",
"* \"logging_steps\": Integer. Log training information every specified number of steps.\n",
"\n",
"* \"flash_attention\": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.\n",
"\n",
"* \"sdp_attention\": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers.)\n",
"\n",
"* \"warmup_steps\": Integer. The number of warmup steps at the start of training, during which a very low learning rate is used.\n",
"\n",
"* \"evals_per_epoch\": Integer. Number of evaluations to be performed within one training epoch.\n",
"\n",
"* \"saves_per_epoch\": Integer. Number of times the model is saved in one training epoch.\n",
"\n",
"* \"weight_decay\": Positive Float. Sets the \"strength\" of weight decay (i.e. setting the coefficient of L2 regularization)"
]
},
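As a quick check on the batch options explained in the cell above, here is a minimal sketch of how they combine into an effective batch size (the single-GPU count is an assumption matching the Colab T4 runtime, not something fixed by Axolotl):

```python
# Illustrative sketch only: how the batch options in the example config combine.
# effective batch size = micro_batch_size * gradient_accumulation_steps * n_gpus
micro_batch_size = 1             # from the example config above
gradient_accumulation_steps = 2  # from the example config above
n_gpus = 1                       # assumption: a single Colab T4

effective_batch_size = micro_batch_size * gradient_accumulation_steps * n_gpus
print(effective_batch_size)  # 2 sequences contribute to each optimizer step
```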
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above is but a snippet aiming to get users familiar with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"metadata": {},
"outputs": [],
"source": [
"# By using the ! the command will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
},
@@ -178,7 +264,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Play with inference"
"Predict with the trained model"
]
},
{
@@ -187,36 +273,85 @@
"metadata": {},
"outputs": [],
"source": [
"# By using the ! the command will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --qlora_model_dir=\"./qlora-out\" --gradio"
" --lora_model_dir=\"./outputs/lora-out\" --gradio"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deeper Dive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is also helpful to gain some familiarity with the core inner workings of axolotl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration Normalization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Axolotl uses a custom Dict class, called ```DictDefault```\n",
"to store configurations specified in the yaml configuration file (into a Python variable named ```cfg```). The definition for this custom Dict can be found in [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py)\n",
"\n",
"```DictDefault``` is amended such that accessing a missing key returns ```None``` instead of raising an error. This is important because if some configuration options aren't specified by the user, the ```None``` type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples of how this is done, check out [utils/config/__init__.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/__init__.py)"
]
},
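A minimal sketch of the missing-key behaviour described above, assuming only what the cell states (illustrative, not Axolotl's actual `DictDefault` implementation; see utils/dict.py for the real one):

```python
class DictDefaultSketch(dict):
    """Toy stand-in for axolotl's DictDefault: a missing key yields None
    instead of a KeyError, and keys are also reachable as attributes."""

    def __missing__(self, key):
        # dict lookup of an absent key returns None rather than raising
        return None

    def __getattr__(self, key):
        # attribute access falls back to dict lookup, then to None
        return self.get(key)


cfg = DictDefaultSketch(learning_rate=2e-5)
print(cfg.learning_rate)    # 2e-05
print(cfg.flash_attention)  # None: unset options read as falsy
if not cfg.flash_attention:
    print("falling back to the default attention implementation")
```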
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Models, Tokenizers, and Trainer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we inspect [cli.train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/cli/train.py), we will find that most of the heavy lifting is done by the function ```train()```, which is itself imported from [src/axolotl/train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/train.py).\n",
"\n",
"```train()``` takes care of loading the appropriate tokenizer and pre-trained model through ```load_model()``` and ```load_tokenizer()``` from [src/axolotl/utils/models.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/models.py) respectively.\n",
"\n",
"```load_tokenizer()``` loads in the appropriate tokenizer given the desired model, as well as chat templates.\n",
"\n",
"The ```ModelLoader``` class follows after the tokenizer has been selected. It will automatically discern the base model type, load in the desired model, and apply model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ```ModelLoader``` will utilize the corresponding \"attention hijacking\" script. For example, if the user specified the base model to be ```NousResearch/Meta-Llama-3.1-8B```, which is of llama type, and set ```flash_attn``` to ```True```, ```ModelLoader``` will load in [llama_attn_hijack_flash.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/monkeypatch/llama_attn_hijack_flash.py). For a list of supported attention hijacking, please refer to the directory [/src/axolotl/monkeypatch/](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch)\n",
"\n",
"Another important operation encompassed in ```train()``` is setting up the training so that it takes into account user-specified training configurations (e.g. num_epochs, optimizer) through the use of ```setup_trainer()``` from [/src/axolotl/utils/trainer.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/trainer.py), which in turn relies on modules from [/src/axolotl/core/trainer_builder.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/core/trainer_builder.py).\n",
"```trainer_builder.py``` provides a list of trainer object options bespoke to the task type (causal LM, or reinforcement learning: 'dpo', 'ipo', 'kto')"
]
},
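The control flow described above, reduced to a hedged sketch; the helper bodies here are stubs standing in for the modules the cell cites, and the real signatures differ:

```python
# Hedged sketch of the train() control flow described above; stub helpers,
# not the real signatures from src/axolotl.

def load_tokenizer(cfg):
    print(f"loading tokenizer for {cfg['base_model']}")
    return "tokenizer"

def load_model(cfg, tokenizer):
    # ModelLoader would discern the base model type here and apply the
    # matching attention patch (e.g. flash attention) before returning
    print(f"loading model {cfg['base_model']} (flash_attention={cfg.get('flash_attention')})")
    return "model"

def setup_trainer(cfg, model, tokenizer):
    print(f"building trainer for task: {cfg.get('rl') or 'causal LM'}")
    return "trainer"

def train(cfg):
    tokenizer = load_tokenizer(cfg)
    model = load_model(cfg, tokenizer)
    trainer = setup_trainer(cfg, model, tokenizer)
    # trainer.train() would run the actual training loop here
    return model, tokenizer

train({"base_model": "NousResearch/Meta-Llama-3.1-8B", "flash_attention": True})
```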
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monkey patch\n",
"\n",
"The [Monkey patch directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch) is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention hijacking, ReLoRA, and unsloth optimization."
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
"nbformat_minor": 2
}
@@ -9,7 +9,7 @@ strict: false
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rms_norm: true
liger_swiglu: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

chat_template: deepseek_v2

@@ -4,7 +4,7 @@ plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

strict: false
95 examples/llama-3/lora-1b-deduplicate-dpo.yml Normal file
@@ -0,0 +1,95 @@
base_model: meta-llama/Llama-3.2-1B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
strict: false

chat_template: llama3
rl: dpo
datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_field_role: role
    message_field_content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_field_role: role
    message_field_content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

dataset_exact_deduplication: true
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
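The config above lists the same dataset twice on purpose so that `dataset_exact_deduplication: true` has duplicates to drop. A minimal sketch of what exact deduplication amounts to, assuming simple row hashing (illustrative only, not Axolotl's implementation):

```python
import hashlib
import json

def exact_dedupe(rows):
    """Drop byte-identical rows, keeping the first occurrence."""
    seen, unique = set(), []
    for row in rows:
        key = hashlib.sha256(json.dumps(row, sort_keys=True).encode()).hexdigest()
        if key not in seen:
            seen.add(key)
            unique.append(row)
    return unique

rows = [{"instruction": "hi"}, {"instruction": "hi"}, {"instruction": "bye"}]
print(len(exact_dedupe(rows)))  # 2: the duplicated entry collapses to one
```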
76 examples/llama-3/lora-1b-deduplicate-sft.yml Normal file
@@ -0,0 +1,76 @@
base_model: meta-llama/Llama-3.2-1B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/lora-out

dataset_exact_deduplication: true
test_value: true

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>
74 examples/llama-3/lora-1b.yml Normal file
@@ -0,0 +1,74 @@
base_model: NousResearch/Llama-3.2-1B

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"
75 examples/llama-3/qlora-1b-kto.yaml Normal file
@@ -0,0 +1,75 @@
base_model: meta-llama/Llama-3.2-1B

load_in_8bit: false
load_in_4bit: true
strict: false

rl: kto
rl_beta: 0.5
kto_desirable_weight: 0.2

datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    type: llama3.ultra
    split: train
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out

remove_unused_columns: false

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: false # not supported with kto
eval_sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 20
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: "<|end_of_text|>"
@@ -1,4 +1,4 @@
base_model: meta-llama/Llama-3.2-1B
base_model: NousResearch/Llama-3.2-1B

load_in_8bit: false
load_in_4bit: true
@@ -22,7 +22,6 @@ pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
93 examples/mistral/mistral-dpo-qlora.yml Normal file
@@ -0,0 +1,93 @@
# Note that we are switching from the regular chat template to chatml.
# If you experience problems with the special tokens, training for more epochs can help.
# After training, merge the model before inference, otherwise you might
# face problems with the special tokens.

base_model: mistralai/Mistral-7B-Instruct-v0.2
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false

chat_template: chatml
rl: dpo
datasets:
  - path: olivermolenschot/alpaca_messages_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_field_role: role
    message_field_content: content

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-qlora

sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true

adapter: qlora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true
lora_fan_in_fan_out:

lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 16
num_epochs: 6
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
s2_attention:

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  bos_token: "<|im_start|>"
  eos_token: "<|im_end|>"
67 examples/qwen2/dpo.yaml Normal file
@@ -0,0 +1,67 @@
base_model: Qwen/Qwen2.5-0.5B

strict: false

chat_template: qwen_25
rl: dpo
datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_field_role: role
    message_field_content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/dpo-out

sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
BIN image/axolotl-badge-web-legacy.png Normal file (Size: 11 KiB)
BIN (binary image updated: 11 KiB before, 24 KiB after)
19 image/axolotl_logo_digital_black.svg Normal file
[SVG image: Axolotl logo (digital black), 3.2 KiB]
11 image/axolotl_logo_digital_white.svg Normal file
[SVG image: Axolotl logo (digital white), 6.6 KiB]
26 image/axolotl_symbol_digital_black.svg Normal file
[SVG image: Axolotl symbol (digital black), 1.6 KiB]
16
image/axolotl_symbol_digital_white.svg
Normal file
@@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
|
||||
<defs>
|
||||
<style>
|
||||
.cls-1 {
|
||||
fill: #fff;
|
||||
}
|
||||
</style>
|
||||
</defs>
|
||||
<!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
|
||||
<g>
|
||||
<g id="Layer_1">
|
||||
<path class="cls-1" d="M152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM269.3,57.3c0-23.8-19.4-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.8,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.8-6.3-14.1-14.1-14.1h-12.2c-6.6,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.4-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.3c0,11,5.2,20.9,13.2,27.2-7.4.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.8,6.3,14.1,14.1,14.1h41.3c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h165.1c7.8,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.9-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.3c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM69.5,255.2H28.2v-20.6h41.3v20.6ZM28.2,214v-20.6h165.1v20.6H28.2ZM255.2,255.2H90.1v-20.6h165.1v20.6ZM255.2,214h-41.3v-20.6h41.3v20.6ZM255.2,90.1h-20.6v20.6h20.6v41.3c0,11.4-9.2,20.6-20.6,20.6H48.9c-11.4,0-20.6-9.2-20.6-20.6v-41.3h20.6v-20.6h-20.6v-32.8c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.3v-20.6h-20.6v-12.2c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.2v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v12.2h-20.6v20.6h41.3v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v32.8ZM193.3,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,11
0.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
      </g>
    </g>
</svg>
After Width: | Height: | Size: 5.0 KiB |
17
image/axolotl_wordmark_digital_black.svg
Normal file
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
  <g>
    <g id="Layer_1">
      <g>
        <path d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
        <path d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
        <path d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
        <path d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
        <path d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
        <path d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
        <path d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
      </g>
    </g>
  </g>
</svg>
After Width: | Height: | Size: 2.1 KiB |
24
image/axolotl_wordmark_digital_white.svg
Normal file
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
  <defs>
    <style>
      .cls-1 {
        fill: #fff;
      }
    </style>
  </defs>
  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142) -->
  <g>
    <g id="Layer_1">
      <g>
        <path class="cls-1" d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
        <path class="cls-1" d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
        <path class="cls-1" d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
        <path class="cls-1" d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
        <path class="cls-1" d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
        <path class="cls-1" d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
        <path class="cls-1" d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
      </g>
    </g>
  </g>
</svg>
After Width: | Height: | Size: 2.3 KiB |
26
pyproject.toml
Normal file
@@ -0,0 +1,26 @@
[build-system]
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
build-backend = "setuptools.build_meta"

[project]
name = "axolotl"
dynamic = ["version", "dependencies", "optional-dependencies"]
description = "LLM Trainer"
readme = "README.md"
requires-python = ">=3.10"

[project.scripts]
axolotl = "axolotl.cli.main:main"

[project.urls]
Homepage = "https://axolotl-ai-cloud.github.io/axolotl/"
Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"

[tool.setuptools_scm]

[tool.setuptools]
py-modules = ["setuptools_axolotl_dynamic_dependencies"]
include-package-data = true

[tool.setuptools.cmdclass]
build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
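Note: the pyproject marks `dependencies` as dynamic and routes `build_py` through a custom cmdclass. The repository's actual `setuptools_axolotl_dynamic_dependencies` module is not shown in this diff; the following is only a minimal sketch of what such a shim might look like, assuming it resolves `requirements.txt` at build time:

```python
# Hypothetical sketch of setuptools_axolotl_dynamic_dependencies.py (assumption,
# not the repository's real implementation).
from setuptools.command.build_py import build_py


class BuildPyCommand(build_py):
    """Attach dynamically computed dependencies before the standard build runs."""

    def run(self):
        # Assumed behavior: read requirements.txt and expose it as install_requires.
        with open("requirements.txt", encoding="utf-8") as handle:
            requirements = [
                line.strip()
                for line in handle
                if line.strip() and not line.startswith(("#", "--"))
            ]
        self.distribution.install_requires = requirements
        super().run()
```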
@@ -2,4 +2,3 @@ pre-commit
black
mypy
types-requests
tbparse
@@ -1,2 +1,5 @@
pytest
pytest-xdist
pytest-retry
pytest-sugar
tbparse
@@ -1,22 +1,30 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

# START section of dependencies that don't install on Darwin/MacOS
bitsandbytes==0.45.0
triton>=2.3.0
mamba-ssm==1.2.0.post1
flash-attn==2.7.0.post2
xformers>=0.0.23.post1
autoawq==0.2.7.post3
liger-kernel==0.4.2
# END section

packaging==23.2
peft==0.13.2
transformers==4.46.0
peft==0.14.0
transformers==4.47.0
tokenizers>=0.20.1
bitsandbytes==0.44.1
accelerate==1.0.1
datasets==3.0.1
deepspeed==0.15.3
accelerate==1.2.0
datasets==3.1.0
deepspeed==0.16.1
pydantic==2.6.3
addict
fire
PyYAML>=6.0
requests
flash-attn==2.6.3
sentencepiece
wandb
einops
xformers>=0.0.23.post1
optimum==1.16.2
hf_transfer
colorama
@@ -26,24 +34,18 @@ numpy>=1.24.4,<=2.0.1
evaluate==0.4.1
scipy
scikit-learn==1.4.2
pynvml
nvidia-ml-py==12.560.30
art
fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
gradio==3.50.2
tensorboard
python-dotenv==1.0.1
autoawq>=0.2.5
triton>=2.3.0
liger-kernel==0.3.0

mamba-ssm==1.2.0.post1

# remote filesystems
s3fs>=2024.5.0
gcsfs>=2024.5.0
# adlfs

trl @ git+https://github.com/huggingface/trl.git@31d02cfb795284591a084416b9dcb7bef5d08924
trl==0.12.1
zstandard==0.22.0
fastcore
@@ -54,3 +56,4 @@ immutabledict==4.2.0
antlr4-python3-runtime==4.13.2

torchao==0.5.0
schedulefree==1.3.0
@@ -2,7 +2,7 @@

# Export specific ENV variables to /etc/rp_environment
echo "Exporting environment variables..."
printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
printenv | grep -E '^HF_|^BNB_|^CUDA_|^NCCL_|^NV|^RUNPOD_|^PATH=|^_=' | sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' | grep -v 'printenv' >> /etc/rp_environment
echo 'source /etc/rp_environment' >> ~/.bashrc

add_keys_to_authorized() {
28
scripts/cutcrossentropy_install.py
Normal file
@@ -0,0 +1,28 @@
"""Script to output the correct installation command for cut-cross-entropy."""
import importlib.util
import sys

try:
    import torch
except ImportError as exc:
    raise ImportError("Install torch via `pip install torch`") from exc
from packaging.version import Version as V

v = V(torch.__version__)

# no cut-cross-entropy support for torch < 2.4.0
if v < V("2.4.0"):
    print("")
    sys.exit(0)

cce_spec = importlib.util.find_spec("cut_cross_entropy")

UNINSTALL_PREFIX = ""
if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "

print(
    UNINSTALL_PREFIX
    + 'pip install "cut-cross-entropy @ git+https://github.com/apple/ml-cross-entropy.git@9c297c905f55b73594b5d650722d1e78183b77bd"'
)
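Note: the script only prints the appropriate install command; something else has to execute it. A plausible consumer, assuming the script lives at `scripts/cutcrossentropy_install.py` as in this diff:

```python
# Run the detector script, then execute whatever install command it prints.
import subprocess

cmd = subprocess.check_output(
    ["python", "scripts/cutcrossentropy_install.py"], text=True
).strip()
if cmd:  # empty output means torch < 2.4.0, i.e. no cut-cross-entropy support
    subprocess.run(cmd, shell=True, check=True)
```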
@@ -13,5 +13,5 @@ cd /workspace
rm -rf /workspace/axolotl
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip install --no-deps -e .
pip install --no-build-isolation --no-deps -e .
```
36
scripts/unsloth_install.py
Normal file
@@ -0,0 +1,36 @@
# noqa
# pylint: skip-file
try:
    import torch
except ImportError:
    raise ImportError("Install torch via `pip install torch`")
from packaging.version import Version as V

v = V(torch.__version__)
cuda = str(torch.version.cuda)
try:
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
except RuntimeError:
    is_ampere = False
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
    raise RuntimeError(f"CUDA = {cuda} not supported!")
if v <= V("2.1.0"):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V("2.1.1"):
    x = "cu{}{}-torch211"
elif v <= V("2.1.2"):
    x = "cu{}{}-torch212"
elif v < V("2.3.0"):
    x = "cu{}{}-torch220"
elif v < V("2.4.0"):
    x = "cu{}{}-torch230"
elif v < V("2.5.0"):
    x = "cu{}{}-torch240"
elif v < V("2.6.0"):
    x = "cu{}{}-torch250"
else:
    raise RuntimeError(f"Torch = {v} too new!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(
    f'pip install unsloth-zoo==2024.11.7 && pip install --no-deps "unsloth[{x}]==2024.11.9"'
)
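As a worked example of the version mapping above: torch 2.4.1 with CUDA 12.1 on an Ampere GPU falls into the `v < V("2.5.0")` branch, so the tag resolves as:

```python
x = "cu{}{}-torch240"
x = x.format("12.1".replace(".", ""), "-ampere")
print(x)  # -> "cu121-ampere-torch240"
```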
47
setup.py
@@ -1,8 +1,10 @@
"""setup.py for axolotl"""

import ast
import os
import platform
import re
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path

from setuptools import find_packages, setup

@@ -39,7 +41,10 @@ def parse_requirements():
    else:
        # detect the version of torch already installed
        # and set it so dependencies don't clobber the torch version
        torch_version = version("torch")
        try:
            torch_version = version("torch")
        except PackageNotFoundError:
            torch_version = "2.5.1"
        _install_requires.append(f"torch=={torch_version}")

        version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -54,6 +59,10 @@ def parse_requirements():

        if (major, minor) >= (2, 5):
            _install_requires.pop(_install_requires.index(xformers_version))
            if patch == 0:
                _install_requires.append("xformers==0.0.28.post2")
            else:
                _install_requires.append("xformers==0.0.28.post3")
            _install_requires.pop(_install_requires.index(autoawq_version))
        elif (major, minor) >= (2, 4):
            if patch == 0:
@@ -84,27 +93,39 @@ def parse_requirements():
    return _install_requires, _dependency_links


def get_package_version():
    with open(
        Path(os.path.dirname(os.path.abspath(__file__)))
        / "src"
        / "axolotl"
        / "__init__.py",
        "r",
        encoding="utf-8",
    ) as fin:
        version_match = re.search(r"^__version__\s*=\s*(.*)$", fin.read(), re.MULTILINE)
        version_ = ast.literal_eval(version_match.group(1))
    return version_


install_requires, dependency_links = parse_requirements()


setup(
    name="axolotl",
    version="0.4.1",
    description="LLM Trainer",
    long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
    version=get_package_version(),
    package_dir={"": "src"},
    packages=find_packages(),
    packages=find_packages("src"),
    install_requires=install_requires,
    dependency_links=dependency_links,
    entry_points={
        "console_scripts": [
            "axolotl=axolotl.cli.main:main",
        ],
    },
    extras_require={
        "flash-attn": [
            "flash-attn==2.6.3",
        ],
        "fused-dense-lib": [
            "fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.6.2#subdirectory=csrc/fused_dense_lib",
            "flash-attn==2.7.0.post2",
        ],
        "deepspeed": [
            "deepspeed==0.14.4",
            "deepspeed==0.16.1",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
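To make the patch-level pinning above concrete: with torch 2.5.1 installed, the regex yields (major, minor, patch) = (2, 5, 1), so the `(major, minor) >= (2, 5)` branch replaces the generic xformers requirement and, because patch != 0, pins `xformers==0.0.28.post3`. A minimal trace of that selection (standalone sketch, not the file itself):

```python
import re

torch_version = "2.5.1"
major, minor, patch = (
    int(g) if g else None
    for g in re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version).groups()
)
if (major, minor) >= (2, 5):
    pin = "xformers==0.0.28.post2" if patch == 0 else "xformers==0.0.28.post3"
print(pin)  # xformers==0.0.28.post3
```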
@@ -0,0 +1,3 @@
"""Axolotl - Train and fine-tune large language models"""

__version__ = "0.6.0"
@@ -27,14 +27,17 @@ from transformers.utils import is_torch_bf16_gpu_available
from transformers.utils.import_utils import _is_package_available

from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
from axolotl.integrations.base import PluginManager
from axolotl.logging_config import configure_logging
from axolotl.train import TrainDatasetMeta
from axolotl.utils.chat_templates import get_chat_template
from axolotl.utils.chat_templates import (
    get_chat_template,
    get_chat_template_from_config,
)
from axolotl.utils.comet_ import setup_comet_env_vars
from axolotl.utils.config import (
    normalize_cfg_datasets,
    normalize_config,
    prepare_plugins,
    validate_config,
)
from axolotl.utils.data import load_prepare_dpo_datasets, prepare_dataset
@@ -97,8 +100,8 @@ def print_dep_versions():
    print("*" * 40)
    print("**** Axolotl Dependency Versions *****")
    for pkg in packages:
        version = _is_package_available(pkg, return_version=True)
        print(f"{pkg: >128,000}: {version[1]: <15}")
        pkg_version = _is_package_available(pkg, return_version=True)
        print(f"{pkg: >128,000}: {pkg_version[1]: <15}")
    print("*" * 40)
@@ -136,7 +139,7 @@ def check_remote_config(config: Union[str, Path]):
        with open(output_path, "wb") as file:
            file.write(content)
        LOG.info(
            f"Using the following config obtained from {config}:\n\n{content.decode('utf-8')}\n"
            f"Using the following config obtained from {config}: \n\n{content.decode('utf-8')}\n"
        )
        return output_path
@@ -190,18 +193,19 @@ def do_inference(
):
    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

    for token, symbol in default_tokens.items():
        # If the token isn't already specified in the config, add it
        if not (cfg.special_tokens and token in cfg.special_tokens):
            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
        chat_template_str = get_chat_template(cfg.chat_template)
    elif cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )

    model = model.to(cfg.device, dtype=cfg.torch_dtype)
@@ -211,13 +215,31 @@
        instruction = get_multi_line_input()
        if not instruction:
            return

        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()
        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        if chat_template_str:
            batch = tokenizer.apply_chat_template(
                [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                return_tensors="pt",
                add_special_tokens=True,
                add_generation_prompt=True,
                chat_template=chat_template_str,
                tokenize=True,
                return_dict=True,
            )
        else:
            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        print("=" * 40)
        model.eval()
@@ -257,13 +279,6 @@ def do_inference_gradio(

    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
    # default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
    default_tokens: Dict[str, str] = {}

    for token, symbol in default_tokens.items():
        # If the token isn't already specified in the config, add it
        if not (cfg.special_tokens and token in cfg.special_tokens):
            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
    chat_template_str = None
@@ -365,7 +380,7 @@ def choose_config(path: Path):

    if len(yaml_files) == 1:
        print(f"Using default YAML file '{yaml_files[0]}'")
        return yaml_files[0]
        return str(yaml_files[0])

    print("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
@@ -376,7 +391,7 @@ def choose_config(path: Path):
        try:
            choice = int(input("Enter the number of your choice: "))
            if 1 <= choice <= len(yaml_files):
                chosen_file = yaml_files[choice - 1]
                chosen_file = str(yaml_files[choice - 1])
            else:
                print("Invalid choice. Please choose a number from the list.")
        except ValueError:
@@ -411,17 +426,14 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):

    cfg.axolotl_config_path = config

    if cfg.get("plugins"):
        plugin_manager = PluginManager.get_instance()
        for plugin_name in cfg["plugins"]:
            plugin_manager.register(plugin_name)

    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
    except:  # pylint: disable=bare-except  # noqa: E722
        gpu_version = None

    prepare_plugins(cfg)

    cfg = validate_config(
        cfg,
        capabilities={
@@ -429,6 +441,9 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
            "n_gpu": int(os.environ.get("WORLD_SIZE", 1)),
            "compute_capability": gpu_version,
        },
        env_capabilities={
            "torch_version": str(torch.__version__).split("+", maxsplit=1)[0],
        },
    )

    prepare_optim_env(cfg)
@@ -2,6 +2,7 @@
CLI to run inference on a trained model
"""
from pathlib import Path
from typing import Union

import fire
import transformers
@@ -16,10 +17,10 @@ from axolotl.cli import (
from axolotl.common.cli import TrainerCliArgs


def do_cli(config: Path = Path("examples/"), gradio=False, **kwargs):
def do_cli(config: Union[Path, str] = Path("examples/"), gradio=False, **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parsed_cfg = load_cfg(config, **kwargs)
    parsed_cfg = load_cfg(config, inference=True, **kwargs)
    parsed_cfg.sample_packing = False
    parser = transformers.HfArgumentParser((TrainerCliArgs))
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
233
src/axolotl/cli/main.py
Normal file
@@ -0,0 +1,233 @@
"""CLI definition for various axolotl commands."""
# pylint: disable=redefined-outer-name
import subprocess  # nosec B404
from typing import Optional

import click

import axolotl
from axolotl.cli.utils import (
    add_options_from_config,
    add_options_from_dataclass,
    build_command,
    fetch_from_github,
)
from axolotl.common.cli import PreprocessCliArgs, TrainerCliArgs
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig


@click.group()
@click.version_option(version=axolotl.__version__, prog_name="axolotl")
def cli():
    """Axolotl CLI - Train and fine-tune large language models"""


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(PreprocessCliArgs)
@add_options_from_config(AxolotlInputConfig)
def preprocess(config: str, **kwargs):
    """Preprocess datasets before training."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    from axolotl.cli.preprocess import do_cli

    do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--accelerate/--no-accelerate",
    default=True,
    help="Use accelerate launch for multi-GPU training",
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
def train(config: str, accelerate: bool, **kwargs):
    """Train or fine-tune a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.train import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--accelerate/--no-accelerate",
    default=True,
    help="Use accelerate launch for multi-GPU inference",
)
@click.option(
    "--lora-model-dir",
    type=click.Path(exists=True, path_type=str),
    help="Directory containing LoRA model",
)
@click.option(
    "--base-model",
    type=click.Path(exists=True, path_type=str),
    help="Path to base model for non-LoRA models",
)
@click.option("--gradio", is_flag=True, help="Launch Gradio interface")
@click.option("--load-in-8bit", is_flag=True, help="Load model in 8-bit mode")
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
def inference(
    config: str,
    accelerate: bool,
    lora_model_dir: Optional[str] = None,
    base_model: Optional[str] = None,
    **kwargs,
):
    """Run inference with a trained model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    del kwargs["inference"]  # interferes with inference.do_cli

    if lora_model_dir:
        kwargs["lora_model_dir"] = lora_model_dir
    if base_model:
        kwargs["output_dir"] = base_model

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.inference"]
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.inference import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--accelerate/--no-accelerate",
    default=False,
    help="Use accelerate launch for multi-GPU operations",
)
@click.option(
    "--model-dir",
    type=click.Path(exists=True, path_type=str),
    help="Directory containing model weights to shard",
)
@click.option(
    "--save-dir",
    type=click.Path(path_type=str),
    help="Directory to save sharded weights",
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
def shard(config: str, accelerate: bool, **kwargs):
    """Shard model weights."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.shard"]
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.shard import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--accelerate/--no-accelerate",
    default=True,
    help="Use accelerate launch for weight merging",
)
@click.option(
    "--model-dir",
    type=click.Path(exists=True, path_type=str),
    help="Directory containing sharded weights",
)
@click.option(
    "--save-path", type=click.Path(path_type=str), help="Path to save merged weights"
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
def merge_sharded_fsdp_weights(config: str, accelerate: bool, **kwargs):
    """Merge sharded FSDP model weights."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    if accelerate:
        base_cmd = [
            "accelerate",
            "launch",
            "-m",
            "axolotl.cli.merge_sharded_fsdp_weights",
        ]
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.merge_sharded_fsdp_weights import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--lora-model-dir",
    type=click.Path(exists=True, path_type=str),
    help="Directory containing the LoRA model to merge",
)
@click.option(
    "--output-dir",
    type=click.Path(path_type=str),
    help="Directory to save the merged model",
)
def merge_lora(
    config: str,
    lora_model_dir: Optional[str] = None,
    output_dir: Optional[str] = None,
):
    """Merge a trained LoRA into a base model"""
    kwargs = {}
    if lora_model_dir:
        kwargs["lora_model_dir"] = lora_model_dir
    if output_dir:
        kwargs["output_dir"] = output_dir

    from axolotl.cli.merge_lora import do_cli

    do_cli(config=config, **kwargs)


@cli.command()
@click.argument("directory", type=click.Choice(["examples", "deepspeed_configs"]))
@click.option("--dest", help="Destination directory")
def fetch(directory: str, dest: Optional[str]):
    """
    Fetch example configs or other resources.

    Available directories:
    - examples: Example configuration files
    - deepspeed_configs: DeepSpeed configuration files
    """
    fetch_from_github(f"{directory}/", dest)


def main():
    cli()


if __name__ == "__main__":
    main()
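The `--accelerate` path above shells out instead of calling `do_cli` directly; `build_command` (defined in `axolotl.cli.utils`, shown later in this diff) turns the residual Click kwargs into argv-style flags. For example:

```python
from axolotl.cli.utils import build_command

base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train", "config.yml"]
cmd = build_command(base_cmd, {"learning_rate": 1e-5, "flash_attention": True})
# ['accelerate', 'launch', '-m', 'axolotl.cli.train', 'config.yml',
#  '--learning-rate', '1e-05', '--flash-attention']
```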
@@ -2,6 +2,7 @@
CLI to run merge a trained LoRA into a base model
"""
from pathlib import Path
from typing import Union

import fire
import transformers
@@ -11,7 +12,7 @@ from axolotl.cli import do_merge_lora, load_cfg, print_axolotl_text_art
from axolotl.common.cli import TrainerCliArgs


def do_cli(config: Path = Path("examples/"), **kwargs):
def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parser = transformers.HfArgumentParser((TrainerCliArgs))
@@ -177,7 +177,7 @@ def merge_fsdp_weights(
    state.wait_for_everyone()


def do_cli(config: Path = Path("examples/"), **kwargs):
def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parser = transformers.HfArgumentParser((TrainerCliArgs))
@@ -23,10 +23,6 @@ from axolotl.cli import (
)
from axolotl.common.cli import PreprocessCliArgs
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.prompt_strategies.sharegpt import (
    register_chatml_template,
    register_llama3_template,
)
from axolotl.utils.trainer import disable_datasets_caching

LOG = logging.getLogger("axolotl.cli.preprocess")
@@ -44,23 +40,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        return_remaining_strings=True
    )

    if parsed_cfg.chat_template == "chatml":
        if parsed_cfg.default_system_message:
            LOG.info(
                f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
            )
            register_chatml_template(parsed_cfg.default_system_message)
        else:
            register_chatml_template()
    elif parsed_cfg.chat_template == "llama3":
        if parsed_cfg.default_system_message:
            LOG.info(
                f"LLaMA-3 set. Adding default system message: {parsed_cfg.default_system_message}"
            )
            register_llama3_template(parsed_cfg.default_system_message)
        else:
            register_llama3_template()

    if not parsed_cfg.dataset_prepared_path:
        msg = (
            Fore.RED
@@ -19,10 +19,6 @@ from axolotl.cli import (
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.integrations.base import PluginManager
from axolotl.prompt_strategies.sharegpt import (
    register_chatml_template,
    register_llama3_template,
)
from axolotl.train import train

LOG = logging.getLogger("axolotl.cli.train")
@@ -42,21 +38,6 @@ def do_train(cfg, cli_args) -> None:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
    if cfg.chat_template == "chatml" and cfg.default_system_message:
        LOG.info(
            f"ChatML set. Adding default system message: {cfg.default_system_message}"
        )
        register_chatml_template(cfg.default_system_message)
    else:
        register_chatml_template()

    if cfg.chat_template == "llama3" and cfg.default_system_message:
        LOG.info(
            f"LLaMA-3 set. Adding default system message: {cfg.default_system_message}"
        )
        register_llama3_template(cfg.default_system_message)
    else:
        register_llama3_template()

    if cfg.rl:  # and cfg.rl != "orpo":
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
218
src/axolotl/cli/utils.py
Normal file
@@ -0,0 +1,218 @@
"""Utility methods for axolotl CLI."""
import concurrent.futures
import dataclasses
import hashlib
import json
import logging
from pathlib import Path
from types import NoneType
from typing import Any, Dict, List, Optional, Tuple, Type, Union, get_args, get_origin

import click
import requests
from pydantic import BaseModel

LOG = logging.getLogger("axolotl.cli.utils")


def add_options_from_dataclass(config_class: Type[Any]):
    """Create Click options from the fields of a dataclass."""

    def decorator(function):
        # Process dataclass fields in reverse order for correct option ordering
        for field in reversed(dataclasses.fields(config_class)):
            field_type = field.type

            if get_origin(field_type) is Union and type(None) in get_args(field_type):
                field_type = next(
                    t for t in get_args(field_type) if not isinstance(t, NoneType)
                )

            if field_type == bool:
                field_name = field.name.replace("_", "-")
                option_name = f"--{field_name}/--no-{field_name}"
                function = click.option(
                    option_name,
                    default=field.default,
                    help=field.metadata.get("description"),
                )(function)
            else:
                option_name = f"--{field.name.replace('_', '-')}"
                function = click.option(
                    option_name,
                    type=field_type,
                    default=field.default,
                    help=field.metadata.get("description"),
                )(function)
        return function

    return decorator


def add_options_from_config(config_class: Type[BaseModel]):
    """Create Click options from the fields of a Pydantic model."""

    def decorator(function):
        # Process model fields in reverse order for correct option ordering
        for name, field in reversed(config_class.model_fields.items()):
            if field.annotation == bool:
                field_name = name.replace("_", "-")
                option_name = f"--{field_name}/--no-{field_name}"
                function = click.option(
                    option_name, default=None, help=field.description
                )(function)
            else:
                option_name = f"--{name.replace('_', '-')}"
                function = click.option(
                    option_name, default=None, help=field.description
                )(function)
        return function

    return decorator


def build_command(base_cmd: List[str], options: Dict[str, Any]) -> List[str]:
    """Build command list from base command and options."""
    cmd = base_cmd.copy()

    for key, value in options.items():
        if value is None:
            continue

        key = key.replace("_", "-")

        if isinstance(value, bool):
            if value:
                cmd.append(f"--{key}")
        else:
            cmd.extend([f"--{key}", str(value)])

    return cmd


def download_file(
    file_info: tuple, raw_base_url: str, dest_path: Path, dir_prefix: str
) -> Tuple[str, str]:
    """
    Download a single file and return its processing status.

    Args:
        file_info: Tuple of (file_path, remote_sha)
        raw_base_url: Base URL for raw GitHub content
        dest_path: Local destination directory
        dir_prefix: Directory prefix to filter files

    Returns:
        Tuple of (file_path, status) where status is 'new', 'updated', or 'unchanged'
    """
    file_path, remote_sha = file_info
    raw_url = f"{raw_base_url}/{file_path}"
    dest_file = dest_path / file_path.split(dir_prefix)[-1]

    # Check if file exists and needs updating
    if dest_file.exists():
        with open(dest_file, "rb") as file:
            content = file.read()
        # Calculate git blob SHA
        blob = b"blob " + str(len(content)).encode() + b"\0" + content
        local_sha = hashlib.sha1(blob, usedforsecurity=False).hexdigest()

        if local_sha == remote_sha:
            print(f"Skipping {file_path} (unchanged)")
            return file_path, "unchanged"

        print(f"Updating {file_path}")
        status = "new"
    else:
        print(f"Downloading {file_path}")
        status = "new"

    # Create directories if needed
    dest_file.parent.mkdir(parents=True, exist_ok=True)

    # Download and save file
    try:
        response = requests.get(raw_url, timeout=30)
        response.raise_for_status()

        with open(dest_file, "wb") as file:
            file.write(response.content)

        return file_path, status
    except (requests.RequestException, IOError) as request_error:
        print(f"Error downloading {file_path}: {str(request_error)}")
        return file_path, "error"


def fetch_from_github(
    dir_prefix: str, dest_dir: Optional[str] = None, max_workers: int = 5
) -> None:
    """
    Sync files from a specific directory in the GitHub repository.
    Only downloads files that don't exist locally or have changed.

    Args:
        dir_prefix: Directory prefix to filter files (e.g., 'examples/', 'deepspeed_configs/')
        dest_dir: Local destination directory
        max_workers: Maximum number of concurrent downloads
    """
    api_url = "https://api.github.com/repos/axolotl-ai-cloud/axolotl/git/trees/main?recursive=1"
    raw_base_url = "https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main"

    # Get repository tree with timeout
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    tree = json.loads(response.text)

    # Filter for files and get their SHA
    files = {
        item["path"]: item["sha"]
        for item in tree["tree"]
        if item["type"] == "blob" and item["path"].startswith(dir_prefix)
    }

    if not files:
        raise click.ClickException(f"No files found in {dir_prefix}")

    # Default destination directory is the last part of dir_prefix
    default_dest = Path(dir_prefix.rstrip("/"))
    dest_path = Path(dest_dir) if dest_dir else default_dest

    # Keep track of processed files for summary
    files_processed: Dict[str, List[str]] = {
        "new": [],
        "updated": [],
        "unchanged": [],
        "error": [],
    }

    # Process files in parallel using ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(
                download_file,
                (file_path, remote_sha),
                raw_base_url,
                dest_path,
                dir_prefix,
            ): file_path
            for file_path, remote_sha in files.items()
        }

        # Process completed tasks as they finish
        for future in concurrent.futures.as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                file_path, status = future.result()
                files_processed[status].append(file_path)
            except (requests.RequestException, IOError) as request_error:
                print(f"Error processing {file_path}: {str(request_error)}")
                files_processed["error"].append(file_path)

    # Log summary
    LOG.info("\nSync Summary:")
    LOG.info(f"New files: {len(files_processed['new'])}")
    LOG.info(f"Updated files: {len(files_processed['updated'])}")
    LOG.info(f"Unchanged files: {len(files_processed['unchanged'])}")
    if files_processed["error"]:
        LOG.info(f"Failed files: {len(files_processed['error'])}")
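The SHA comparison in `download_file` relies on how git hashes blobs: the hashed object is `b"blob " + str(len(content)) + b"\0" + content`, run through SHA-1. A quick standalone check of that construction:

```python
import hashlib

content = b"hello\n"
blob = b"blob " + str(len(content)).encode() + b"\0" + content
print(hashlib.sha1(blob).hexdigest())
# ce013625030ba8dba906f756967f9e9ca394464a — matches `git hash-object` on "hello\n"
```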
@@ -3,36 +3,88 @@ helper functions for fixing the embeddings/tokenizer
"""

# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
# GNU LESSER GENERAL PUBLIC LICENSE
# Version 3, 29 June 2007
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
# Everyone is permitted to copy and distribute verbatim copies
# of this license document, but changing it is not allowed.

import gc
import itertools
import logging
from collections import Counter

import datasets
import numpy as np
import torch

LOG = logging.getLogger("axolotl.core.tokenizer_utils")

@torch.inference_mode
def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):

@torch.inference_mode()
def fix_untrained_tokens(  # pylint: disable=too-many-return-statements
    model, tokenizer, train_dataset, ignored_tokenizer_names=None, eps=1e-16
):
    """
    Many of the newer models have reserved tokens that are not trained.
    Llama-3, e.g., has untrained vectors in the base model.
    These include <|eot_id|>, <|start_header_id|>, <|end_header_id|>
    We reset them to the mean of the rest of the tokens
    """
    # Code licensed under LGPL
    embedding_matrix = model.get_input_embeddings().weight
    lm_head_matrix = model.get_output_embeddings().weight
    chat_template = getattr(tokenizer, "chat_template", None)
    tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer

    # Ignore some model checks for now
    if not ignored_tokenizer_names:
        ignored_tokenizer_names = []
    if (
        model.config._name_or_path  # pylint: disable=protected-access
        in ignored_tokenizer_names
    ):
        return

    # Sometimes the sizes can be different, as in vision models
    # i.e. <image> is in the input embeddings, but not in the output
    min_size = min(embedding_matrix.shape[1], lm_head_matrix.shape[1])
    embedding_matrix = embedding_matrix[:, :min_size]
    lm_head_matrix = lm_head_matrix[:, :min_size]

    # Get untrained tokens
    indicator_untrained = torch.amax(embedding_matrix, axis=1) <= eps
    indicator_untrained1 = torch.amax(embedding_matrix, axis=1) <= eps
    # Check lm_head as well

    # Does NOT work for Llama 3.1!!
    indicator_untrained2 = torch.amax(lm_head_matrix, axis=1) <= eps

    # We instead check for repeated vectors
    lm_head_where = torch.where(indicator_untrained1)[0]
    lm_head_bad = lm_head_matrix[lm_head_where]
    lm_head_bad = lm_head_bad.cpu().float().numpy().round(3)
    counter = Counter()
    for row in lm_head_bad:
        counter[hash(row.data.tobytes())] += 1
    counter = Counter({k: c for k, c in counter.items() if c >= 2})

    lm_head_where = lm_head_where.cpu().numpy()
    final_bad_lm_head = []
    for j, row in enumerate(lm_head_bad):
        if hash(row.data.tobytes()) in counter:
            final_bad_lm_head.append(lm_head_where[j])
    indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2)
    indicator_untrained2[final_bad_lm_head] = True

    # Combine both checks
    indicator_untrained = indicator_untrained1 & indicator_untrained2

    # Remove pad token possibility
    if hasattr(tokenizer, "pad_token_id"):
        pad_token_id = tokenizer.pad_token_id
        if pad_token_id is not None and pad_token_id < indicator_untrained.shape[0]:
            indicator_untrained[pad_token_id] = False

    where_untrained = torch.where(indicator_untrained)[0]
    n_untrained = where_untrained.shape[0]
    n_trained = embedding_matrix.shape[0] - n_untrained
@@ -40,10 +92,9 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):
    # Get set and actual tokens
    where_untrained = where_untrained.tolist()
    if len(where_untrained) == 0:
        return False
        return

    # Remove untrained indices where it's longer

    where_untrained_set = frozenset(where_untrained)
    actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained)
    # Remove None items in actual_bad_tokens
@@ -53,10 +104,14 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):
    if_bad_first = False
    if_bad_second = False
    # Check tokenizer's chat template for any untrained tokens
    chat_template = getattr(tokenizer, "chat_template", None)
    if chat_template is not None:
        if_bad_first = any(x in chat_template for x in actual_bad_tokens)

    if isinstance(train_dataset, datasets.IterableDataset):
        # Skip the check, since the code below assumes
        # an indexable dataset
        return

    # Check the first 250, last 250 input_ids
    size_dataset = len(train_dataset)
    size = min(size_dataset, 250)
@@ -83,7 +138,69 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):

    # Check if bad tokens exist!
    if not if_bad_first and not if_bad_second:
        return False
        return

    # Check if lm_head / embed_token are trainable!
    bad_not_trainable = False
    if not embedding_matrix.requires_grad:
        bad_not_trainable = True
    if not lm_head_matrix.requires_grad:
        bad_not_trainable = True

    if bad_not_trainable:  # pylint: disable=too-many-nested-blocks
        final_bad_items = []

        # Re-check the first 250, last 250 input_ids
        size_dataset = len(train_dataset)
        size = min(size_dataset, 250)
        for j in range(size):
            input_ids = train_dataset[j]
            if "input_ids" in input_ids:
                input_ids = input_ids["input_ids"]
                for item in input_ids:
                    if item in where_untrained_set:
                        final_bad_items.append(item)

        # Re-check last 250
        left = max(size_dataset - 250, 0)
        for j in range(left, size_dataset):
            input_ids = train_dataset[j]
            if "input_ids" in input_ids:
                input_ids = input_ids["input_ids"]
                for item in input_ids:
                    if item in where_untrained_set:
                        final_bad_items.append(item)

        # If no bad tokens, possibly chat template itself has issues?
        if len(final_bad_items) == 0:
            # Recheck 2000 and last 2000 items
            size_dataset = len(train_dataset)
            size = min(size_dataset, 2000)
            for j in range(size):
                input_ids = train_dataset[j]
                if "input_ids" in input_ids:
                    input_ids = input_ids["input_ids"]
                    for item in input_ids:
                        if item in where_untrained_set:
                            final_bad_items.append(item)

            # Re-check last 2000
            left = max(size_dataset - 2000, 0)
            for j in range(left, size_dataset):
                input_ids = train_dataset[j]
                if "input_ids" in input_ids:
                    input_ids = input_ids["input_ids"]
                    for item in input_ids:
                        if item in where_untrained_set:
                            final_bad_items.append(item)

            # Most likely false signal!
            if len(final_bad_items) == 0:
                return

        raise ValueError(
            f"Untrained tokens of [{list(set(final_bad_items))}] found, but embed_tokens & lm_head not trainable, causing NaNs. "
        )

    # Count all the possible bad tokens
    final_counts = np.zeros(
@@ -97,6 +214,23 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):

    train_dataset.map(mapping, batched=True, desc="Counting untrained tokens")

    # Get counts for untrained tokens
    counts_untrained = final_counts[where_untrained]
    # Identify untrained tokens seen in train_dataset
    indices_seen_in_train = np.where(counts_untrained > 0)[0]
    tokens_to_update = [where_untrained[i] for i in indices_seen_in_train]

    if len(tokens_to_update) == 0:
        LOG.info(
            "No untrained tokens found in train_dataset. No embeddings were modified."
        )
        return

    # Log the token IDs that are being rescaled
    LOG.info(
        f"Rescaling embeddings for tokens seen in train_dataset: {tokens_to_update}"
    )

    # Get sum of all items
    sum_embedding = torch.sum(embedding_matrix, dtype=torch.float32, axis=0)
    sum_lm_head = torch.sum(lm_head_matrix, dtype=torch.float32, axis=0)
@@ -113,38 +247,26 @@ def fix_untrained_tokens(model, tokenizer, train_dataset, eps=1e-16):
    mean_embedding = sum_embedding / n_trained
    mean_lm_head = sum_lm_head / n_trained

    # Scale each to be equal to 1/max_frequency. Also set some to 0 if none seen
    scaling = final_counts[where_untrained] / max(final_counts.max(), 1)
    # Compute scaling for tokens to update
    scaling = counts_untrained[indices_seen_in_train] / max(final_counts.max(), 1)
    scaling = torch.tensor(scaling, device=mean_embedding.device).unsqueeze(1)
    mean_embedding = (
        mean_embedding.repeat(
            (
                n_untrained,
                1,
            )
        )
        * scaling
    )
    mean_lm_head = (
        mean_lm_head.repeat(
            (
                n_untrained,
                1,
            )
        )
        * scaling
    )
    where_null = scaling.ravel() == 0
    mean_embedding[where_null] = 0
    mean_lm_head[where_null] = 0

    # Set them to the mean
    embedding_matrix[where_untrained] = mean_embedding.to(embedding_matrix.dtype)
    lm_head_matrix[where_untrained] = mean_lm_head.to(lm_head_matrix.dtype)
    # Prepare mean embeddings for tokens to update
    mean_embedding_repeated = (
        mean_embedding.unsqueeze(0).repeat(len(tokens_to_update), 1) * scaling
    )
    mean_lm_head_repeated = (
        mean_lm_head.unsqueeze(0).repeat(len(tokens_to_update), 1) * scaling
    )

    # Update embeddings only for tokens seen in train_dataset
    embedding_matrix[tokens_to_update] = mean_embedding_repeated.to(
        embedding_matrix.dtype
    )
    lm_head_matrix[tokens_to_update] = mean_lm_head_repeated.to(lm_head_matrix.dtype)

    # Clean up
    for _ in range(3):
        gc.collect()
        torch.cuda.empty_cache()

    return True
    return
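Outside the diff itself, the core idea of `fix_untrained_tokens` can be sketched in a few lines: embedding rows whose maximum magnitude is essentially zero are treated as untrained and overwritten with a mean of the trained rows. A toy illustration (my own simplification, not the function above — the real code also scales by token frequency and checks the lm_head):

```python
import torch

emb = torch.randn(10, 4)
emb[7] = 0.0  # pretend token 7 was never trained

untrained = torch.amax(emb.abs(), dim=1) <= 1e-16
mean_vec = emb[~untrained].mean(dim=0)
emb[untrained] = mean_vec  # reset untrained rows to the trained mean
```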
1033
src/axolotl/core/trainers/base.py
Normal file
@@ -40,7 +40,7 @@ class TRLPPOTrainer(PPOTrainer):
            query_tensors,
            return_prompt=False,
            generate_ref_response=True,
            **generation_kwargs
            **generation_kwargs,
        )
        batch["response"] = self.tokenizer.batch_decode(response_tensors)
        batch["ref_response"] = self.tokenizer.batch_decode(ref_response_tensors)
220
src/axolotl/core/training_args.py
Normal file
@@ -0,0 +1,220 @@
|
||||
"""
|
||||
extra axolotl specific training args
|
||||
"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from transformers import TrainingArguments
|
||||
from trl import CPOConfig, DPOConfig, KTOConfig, ORPOConfig, RewardConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class AxolotlTrainingMixins:
|
||||
"""
|
||||
Mixin class for the Axolotl training args.
|
||||
"""
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
model_type: Optional[str] = field(
|
||||
default=None, metadata={"help": "HF model configuration model_type."}
|
||||
)
|
||||
lr_quadratic_warmup: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Use quadratic warmup for cosine scheduling."},
|
||||
)
|
||||
pretraining: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Indicates to trainer whether we are doing continued pretraining."
|
||||
},
|
||||
)
|
||||
sample_packing: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Use sample packing for efficient training."},
|
||||
)
|
||||
multipack_real_batches: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Use real batches for efficient training."},
|
||||
)
|
||||
eval_sample_packing: Optional[bool] = field(
|
||||
default=None,
|
||||
metadata={"help": "Use sample packing for efficient evals."},
|
||||
)
|
||||
sample_packing_efficiency: float = field(
|
||||
default=1.0,
|
||||
metadata={"help": "Sample packing efficiency for calculating batch length."},
|
||||
)
|
||||
sample_packing_bin_size: int = field(
|
||||
default=200,
|
||||
metadata={
|
||||
"help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
|
||||
},
|
||||
)
|
||||
sample_packing_group_size: int = field(
|
||||
default=100000,
|
||||
metadata={
|
||||
"help": "The number of samples to group together for packing. Increase for better packing."
|
||||
},
|
||||
)
|
||||
max_seq_length: int = field(
|
||||
default=2048,
|
||||
metadata={"help": "The maximum sequence length the model can handle"},
|
||||
)
|
||||
relora_steps: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "how often to reset for ReLoRA"},
|
||||
)
|
||||
relora_warmup_steps: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
||||
)
|
||||
relora_anneal_steps: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
||||
)
|
||||
relora_prune_ratio: Optional[float] = field(
|
||||
default=0.9,
|
||||
metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
|
||||
)
|
||||
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
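    # Illustrative arithmetic: with max_steps=1000 and cosine_constant_lr_ratio=0.8,
    # the schedule switches to a constant learning rate at step 0.8 * 1000 = 800.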
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of active layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_optimizer: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate optimizer to the HF trainer"
        },
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )


@dataclass
class AxolotlTrainingArguments(AxolotlTrainingMixins, TrainingArguments):
    """
    Training arguments for Causal trainer

    This code is duplicated because HF TrainingArguments does not set a default
    value for output_dir, so it can't be used as a mixin.
    """


@dataclass
class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
    """
    DPO config for DPO training
    """


@dataclass
class AxolotlORPOConfig(AxolotlTrainingMixins, ORPOConfig):
    """
    ORPO config for ORPO training
    """


@dataclass
class AxolotlKTOConfig(AxolotlTrainingMixins, KTOConfig):
    """
    KTO config for KTO training
    """


@dataclass
class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
    """
    CPO config for CPO training
    """

    simpo_gamma: Optional[float] = field(
        default=None,
        metadata={"help": "simpo gamma parameter"},
    )


@dataclass
class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
    """
    Reward config for Reward training
    """
@@ -18,9 +18,10 @@ Plugins can be used to integrate third-party models, modify the training process
|
||||
|
||||
To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
|
||||
"""
|
||||
import collections
|
||||
import importlib
|
||||
import logging
|
||||
from typing import List
|
||||
from typing import OrderedDict
|
||||
|
||||
|
||||
class BasePlugin:
|
||||
@@ -47,7 +48,7 @@ class BasePlugin:
|
||||
Initializes the BasePlugin.
|
||||
"""
|
||||
|
||||
def register(self, cfg):
|
||||
def register(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Registers the plugin with the given configuration.
|
||||
|
||||
@@ -63,7 +64,7 @@ class BasePlugin:
|
||||
Returns a pydantic model for the plugin's input arguments.
|
||||
"""
|
||||
|
||||
def pre_model_load(self, cfg):
|
||||
def pre_model_load(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions before the model is loaded.
|
||||
|
||||
@@ -74,7 +75,7 @@ class BasePlugin:
|
||||
None
|
||||
"""
|
||||
|
||||
def post_model_load(self, cfg, model):
|
||||
def post_model_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after the model is loaded.
|
||||
|
||||
@@ -86,7 +87,7 @@ class BasePlugin:
|
||||
None
|
||||
"""
|
||||
|
||||
def pre_lora_load(self, cfg, model):
|
||||
def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions before LoRA weights are loaded.
|
||||
|
||||
@@ -98,7 +99,7 @@ class BasePlugin:
|
||||
None
|
||||
"""
|
||||
|
||||
def post_lora_load(self, cfg, model):
|
||||
def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after LoRA weights are loaded.
|
||||
|
||||
@@ -110,7 +111,7 @@ class BasePlugin:
|
||||
None
|
||||
"""
|
||||
|
||||
def create_optimizer(self, cfg, trainer):
|
||||
def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
|
||||
"""
|
||||
Creates and returns an optimizer for training.
|
||||
|
||||
@@ -122,7 +123,9 @@ class BasePlugin:
|
||||
object: The created optimizer.
|
||||
"""
|
||||
|
||||
def create_lr_scheduler(self, cfg, trainer, optimizer):
|
||||
def create_lr_scheduler(
|
||||
self, cfg, trainer, optimizer
|
||||
): # pylint: disable=unused-argument
|
||||
"""
|
||||
Creates and returns a learning rate scheduler.
|
||||
|
||||
@@ -135,9 +138,9 @@ class BasePlugin:
|
||||
object: The created learning rate scheduler.
|
||||
"""
|
||||
|
||||
def add_callbacks_pre_trainer(self, cfg, model):
|
||||
def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Adds callbacks to the trainer before training.
|
||||
Sets up callbacks before creating the trainer.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
@@ -146,20 +149,25 @@ class BasePlugin:
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs
|
||||
"""
|
||||
return []
|
||||
|
||||
def add_callbacks_post_trainer(self, cfg, trainer):
|
||||
def add_callbacks_post_trainer(
|
||||
self, cfg, trainer
|
||||
): # pylint: disable=unused-argument
|
||||
"""
|
||||
Adds callbacks to the trainer after training.
|
||||
Adds callbacks to the trainer after creating the trainer.
|
||||
This is useful for callbacks that require access to the model or trainer.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
trainer (object): The trainer object for training.
|
||||
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs
|
||||
List[callable]: A list of callback functions to be added
|
||||
"""
|
||||
return []
|
||||
|
||||
def post_train(self, cfg, model):
|
||||
def post_train(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after training is complete.
|
||||
|
||||
@@ -171,7 +179,7 @@ class BasePlugin:
|
||||
None
|
||||
"""
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
def post_train_unload(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after training is complete and the model is unloaded.
|
||||
|
||||
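# A minimal custom plugin against the BasePlugin interface above (a hypothetical
# illustration; `PrinterPlugin` is not part of this changeset):
class PrinterPlugin(BasePlugin):
    def add_callbacks_post_trainer(self, cfg, trainer):
        # return extra callbacks once the trainer exists
        from transformers.trainer_callback import PrinterCallback

        return [PrinterCallback()]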
@@ -227,7 +235,7 @@ class PluginManager:
|
||||
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
|
||||
"""
|
||||
|
||||
plugins: List[BasePlugin] = []
|
||||
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
|
||||
|
||||
_instance = None
|
||||
|
||||
@@ -237,7 +245,7 @@ class PluginManager:
|
||||
"""
|
||||
if cls._instance is None:
|
||||
cls._instance = super(PluginManager, cls).__new__(cls)
|
||||
cls._instance.plugins: List[BasePlugin] = []
|
||||
cls._instance.plugins = collections.OrderedDict()
|
||||
return cls._instance
|
||||
|
||||
@staticmethod
|
||||
@@ -265,7 +273,7 @@ class PluginManager:
|
||||
"""
|
||||
try:
|
||||
plugin = load_plugin(plugin_name)
|
||||
self.plugins.append(plugin)
|
||||
self.plugins[plugin_name] = plugin
|
||||
except ImportError:
|
||||
logging.error(f"Failed to load plugin: {plugin_name}")
|
||||
|
||||
@@ -277,7 +285,7 @@ class PluginManager:
|
||||
list[str]: A list of Pydantic classes for all registered plugins' input arguments.
|
||||
"""
|
||||
input_args = []
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
input_args_from_plugin = plugin.get_input_args()
|
||||
if input_args_from_plugin is not None:
|
||||
input_args.append(input_args_from_plugin)
|
||||
@@ -293,7 +301,7 @@ class PluginManager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
plugin.pre_model_load(cfg)
|
||||
|
||||
def post_model_load(self, cfg, model):
|
||||
@@ -307,7 +315,7 @@ class PluginManager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_model_load(cfg, model)
|
||||
|
||||
def pre_lora_load(self, cfg, model):
|
||||
@@ -321,7 +329,7 @@ class PluginManager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
plugin.pre_lora_load(cfg, model)
|
||||
|
||||
def post_lora_load(self, cfg, model):
|
||||
@@ -335,7 +343,7 @@ class PluginManager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_lora_load(cfg, model)
|
||||
|
||||
def create_optimizer(self, cfg, trainer):
|
||||
@@ -349,7 +357,7 @@ class PluginManager:
|
||||
Returns:
|
||||
object: The created optimizer, or None if none was found.
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
optimizer = plugin.create_optimizer(cfg, trainer)
|
||||
if optimizer is not None:
|
||||
return optimizer
|
||||
@@ -367,7 +375,7 @@ class PluginManager:
|
||||
Returns:
|
||||
object: The created learning rate scheduler, or None if none was found.
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
|
||||
if scheduler is not None:
|
||||
return scheduler
|
||||
@@ -385,8 +393,10 @@ class PluginManager:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||
"""
|
||||
callbacks = []
|
||||
for plugin in self.plugins:
|
||||
callbacks.extend(plugin.add_callbacks_pre_trainer(cfg, model))
|
||||
for plugin in self.plugins.values():
|
||||
plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
|
||||
if plugin_callbacks: # if the plugin returned a list of callbacks
|
||||
callbacks.extend(plugin_callbacks)
|
||||
return callbacks
|
||||
|
||||
def add_callbacks_post_trainer(self, cfg, trainer):
|
||||
@@ -401,8 +411,10 @@ class PluginManager:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||
"""
|
||||
callbacks = []
|
||||
for plugin in self.plugins:
|
||||
callbacks.extend(plugin.add_callbacks_post_trainer(cfg, trainer))
|
||||
for plugin in self.plugins.values():
|
||||
plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer)
|
||||
if plugin_callbacks:
|
||||
callbacks.extend(plugin_callbacks)
|
||||
return callbacks
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
@@ -416,5 +428,5 @@ class PluginManager:
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_train_unload(cfg)
|
||||
|
||||
325
src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.md
Normal file
@@ -0,0 +1,325 @@
|
||||
Acknowledgements
|
||||
|
||||
Portions of this Cut Cross Entropy Software may utilize the following copyrighted
|
||||
material, the use of which is hereby acknowledged.
|
||||
|
||||
|
||||
------
|
||||
|
||||
|
||||
PyTorch
|
||||
|
||||
From PyTorch:
|
||||
|
||||
Copyright (c) 2016- Facebook, Inc (Adam Paszke)
|
||||
Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
|
||||
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
|
||||
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
|
||||
Copyright (c) 2011-2013 NYU (Clement Farabet)
|
||||
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
|
||||
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
|
||||
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
|
||||
|
||||
From Caffe2:
|
||||
|
||||
Copyright (c) 2016-present, Facebook Inc. All rights reserved.
|
||||
|
||||
All contributions by Facebook:
|
||||
Copyright (c) 2016 Facebook Inc.
|
||||
|
||||
All contributions by Google:
|
||||
Copyright (c) 2015 Google Inc.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Yangqing Jia:
|
||||
Copyright (c) 2015 Yangqing Jia
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Kakao Brain:
|
||||
Copyright 2019-2020 Kakao Brain
|
||||
|
||||
All contributions by Cruise LLC:
|
||||
Copyright (c) 2022 Cruise LLC.
|
||||
All rights reserved.
|
||||
|
||||
All contributions by Arm:
|
||||
Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates
|
||||
|
||||
All contributions from Caffe:
|
||||
Copyright(c) 2013, 2014, 2015, the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
All other contributions:
|
||||
Copyright(c) 2015, 2016 the respective contributors
|
||||
All rights reserved.
|
||||
|
||||
Caffe2 uses a copyright model similar to Caffe: each contributor holds
|
||||
copyright over their contributions to Caffe2. The project versioning records
|
||||
all such contribution and copyright details. If a contributor wants to further
|
||||
mark their specific copyright on a particular contribution, they should
|
||||
indicate their copyright solely in the commit message of the change when it is
|
||||
committed.
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
|
||||
and IDIAP Research Institute nor the names of its contributors may be
|
||||
used to endorse or promote products derived from this software without
|
||||
specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
Triton
|
||||
|
||||
/*
|
||||
* Copyright 2018-2020 Philippe Tillet
|
||||
* Copyright 2020-2022 OpenAI
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
Transformers
|
||||
|
||||
Copyright 2018- The Hugging Face team. All rights reserved.
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
47
src/axolotl/integrations/cut_cross_entropy/LICENSE
Normal file
@@ -0,0 +1,47 @@
|
||||
Copyright (C) 2024 Apple Inc. All Rights Reserved.
|
||||
|
||||
IMPORTANT: This Apple software is supplied to you by Apple
|
||||
Inc. ("Apple") in consideration of your agreement to the following
|
||||
terms, and your use, installation, modification or redistribution of
|
||||
this Apple software constitutes acceptance of these terms. If you do
|
||||
not agree with these terms, please do not use, install, modify or
|
||||
redistribute this Apple software.
|
||||
|
||||
In consideration of your agreement to abide by the following terms, and
|
||||
subject to these terms, Apple grants you a personal, non-exclusive
|
||||
license, under Apple's copyrights in this original Apple software (the
|
||||
"Apple Software"), to use, reproduce, modify and redistribute the Apple
|
||||
Software, with or without modifications, in source and/or binary forms;
|
||||
provided that if you redistribute the Apple Software in its entirety and
|
||||
without modifications, you must retain this notice and the following
|
||||
text and disclaimers in all such redistributions of the Apple Software.
|
||||
Neither the name, trademarks, service marks or logos of Apple Inc. may
|
||||
be used to endorse or promote products derived from the Apple Software
|
||||
without specific prior written permission from Apple. Except as
|
||||
expressly stated in this notice, no other rights or licenses, express or
|
||||
implied, are granted by Apple herein, including but not limited to any
|
||||
patent rights that may be infringed by your derivative works or by other
|
||||
works in which the Apple Software may be incorporated.
|
||||
|
||||
The Apple Software is provided by Apple on an "AS IS" basis. APPLE
|
||||
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
|
||||
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
|
||||
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
|
||||
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.
|
||||
|
||||
IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
|
||||
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
|
||||
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
|
||||
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
|
||||
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
SOFTWARE DISTRIBUTED WITH CUT CROSS ENTROPY:
|
||||
|
||||
The Cut Cross Entropy software includes a number of subcomponents with separate
|
||||
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.md.
|
||||
-------------------------------------------------------------------------------
|
||||
10
src/axolotl/integrations/cut_cross_entropy/README.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Cut Cross Entropy

### Usage

```yaml
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

cut_cross_entropy: true
```
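Cut Cross Entropy requires PyTorch >= 2.4 and `pip install "cut-cross-entropy[transformers]==24.11.4"`; the plugin checks both at load time (see `__init__.py` below). It also requires `bf16` or `fp16` training (see `args.py`).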
83
src/axolotl/integrations/cut_cross_entropy/__init__.py
Normal file
@@ -0,0 +1,83 @@
|
||||
# Copyright 2024 Axolotl AI. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
"""
|
||||
Module for the Plugin for Cut Cross Entropy integration with Axolotl.
|
||||
|
||||
Cut Cross Entropy is an optimized implementation of cross entropy loss
|
||||
from Apple's ML team.
|
||||
"""
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
import torch
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
from axolotl.utils import get_pytorch_version
|
||||
|
||||
from ...utils.distributed import zero_only
|
||||
from .args import CutCrossEntropyArgs # pylint: disable=unused-import. # noqa: F401
|
||||
|
||||
LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy")
|
||||
|
||||
_CCE_INSTALL_MESSAGE = (
|
||||
"Please install cut_cross_entropy with transformers support using "
|
||||
'`pip install "cut-cross-entropy[transformers]==24.11.4"`'
|
||||
)
|
||||
|
||||
|
||||
class CutCrossEntropyPlugin(BasePlugin):
|
||||
"""
|
||||
Plugin for Cut Cross Entropy integration with Axolotl.
|
||||
"""
|
||||
|
||||
def get_input_args(self):
|
||||
return "axolotl.integrations.cut_cross_entropy.CutCrossEntropyArgs"
|
||||
|
||||
def _check_requirements(self):
|
||||
"""Check if all requirements are met."""
|
||||
# Check PyTorch version
|
||||
|
||||
major, minor, _ = get_pytorch_version()
|
||||
if (major, minor) < (2, 4):
|
||||
raise ImportError(
|
||||
"Cut Cross Entropy requires PyTorch >= 2.4.0. "
|
||||
f"Current version: {torch.__version__}"
|
||||
)
|
||||
|
||||
# Check if cut_cross_entropy is installed
|
||||
cce_spec = importlib.util.find_spec("cut_cross_entropy")
|
||||
if cce_spec is None:
|
||||
raise ImportError(_CCE_INSTALL_MESSAGE)
|
||||
|
||||
cce_spec_transformers = importlib.util.find_spec(
|
||||
"cut_cross_entropy.transformers"
|
||||
)
|
||||
if cce_spec_transformers is None:
|
||||
raise ImportError(_CCE_INSTALL_MESSAGE)
|
||||
|
||||
def pre_model_load(self, cfg):
|
||||
"""Apply cut cross entropy before model loading if enabled."""
|
||||
if cfg.cut_cross_entropy:
|
||||
self._check_requirements()
|
||||
|
||||
from cut_cross_entropy.transformers import cce_patch
|
||||
|
||||
with zero_only():
|
||||
LOG.info(
|
||||
f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
|
||||
)
|
||||
|
||||
# The patch checks model_type internally
|
||||
cce_patch(cfg.model_config_type)
|
||||
42
src/axolotl/integrations/cut_cross_entropy/args.py
Normal file
@@ -0,0 +1,42 @@
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for handling Cut Cross Entropy input arguments.
"""
import logging
from typing import Optional

from pydantic import BaseModel, model_validator

LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy.args")


class CutCrossEntropyArgs(BaseModel):
    """
    Input args for Cut Cross Entropy.
    """

    cut_cross_entropy: Optional[bool] = None

    @model_validator(mode="before")
    @classmethod
    def check_dtype_is_half(cls, data):
        if data.get("cut_cross_entropy") and not (data.get("bf16") or data.get("fp16")):
            raise ValueError(
                "Cut Cross Entropy requires fp16/bf16 training for backward pass. "
                "Please set `bf16` or `fp16` to `True`."
            )

        return data
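A config that satisfies the validator above pairs the plugin with a half-precision flag (an illustrative sketch; an equivalent `fp16` setup also works):

```yaml
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

cut_cross_entropy: true
bf16: true
```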
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2024 Nikhil Vyas
+Copyright (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
13
src/axolotl/integrations/grokfast/README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# Grokfast Optimizer

See https://github.com/ironjr/grokfast

### Usage

```yaml
plugins:
  - axolotl.integrations.grokfast.GrokfastPlugin

grokfast_alpha: 0.98
grokfast_lamb: 2.0
```
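Here `grokfast_alpha` is the EMA coefficient and `grokfast_lamb` the amplification factor: each step the filter updates an EMA of the gradients and adds it back scaled by lamb, i.e. `ema = alpha * ema + (1 - alpha) * grad; grad += lamb * ema` (see `optimizer.py` below).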
50
src/axolotl/integrations/grokfast/__init__.py
Normal file
@@ -0,0 +1,50 @@
|
||||
"""
|
||||
Grokfast plugin for Axolotl
|
||||
"""
|
||||
import logging
|
||||
|
||||
from transformers.trainer_callback import TrainerCallback
|
||||
|
||||
from ..base import BasePlugin
|
||||
from .args import GrokfastArgs # pylint: disable=unused-import. # noqa: F401
|
||||
from .optimizer import gradfilter_ema
|
||||
|
||||
LOG = logging.getLogger("axolotl.integrations.grokfast")
|
||||
|
||||
|
||||
class GrokfastCallbackHandler(TrainerCallback):
|
||||
"""
|
||||
Transformer trainer callbacks for Grokfast
|
||||
"""
|
||||
|
||||
def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs):
|
||||
super().__init__(*args_, **kwargs)
|
||||
self.grads = None
|
||||
self.alpha = alpha
|
||||
self.lamb = lamb
|
||||
|
||||
def on_train_begin(self, *args_, **kwargs): # pylint: disable=unused-argument
|
||||
self.grads = None
|
||||
|
||||
def on_pre_optimizer_step(
|
||||
self, args_, state, control, **kwargs
|
||||
): # pylint: disable=unused-argument
|
||||
model = kwargs.pop("model")
|
||||
self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
|
||||
return control
|
||||
|
||||
|
||||
class GrokfastPlugin(BasePlugin):
|
||||
"""
|
||||
Plugin for Grokfast optimizer integration with Axolotl.
|
||||
"""
|
||||
|
||||
def get_input_args(self):
|
||||
return "axolotl.integrations.grokfast.GrokfastArgs"
|
||||
|
||||
def add_callbacks_post_trainer(self, cfg, trainer):
|
||||
LOG.info("Adding Grokfast callback to the trainer")
|
||||
callback = GrokfastCallbackHandler(
|
||||
alpha=cfg.grokfast_alpha, lamb=cfg.grokfast_lamb
|
||||
)
|
||||
return [callback]
|
||||
15
src/axolotl/integrations/grokfast/args.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
config args for grokfast plugin
|
||||
"""
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class GrokfastArgs(BaseModel):
|
||||
"""
|
||||
Input args for Grokfast optimizer.
|
||||
"""
|
||||
|
||||
grokfast_alpha: Optional[float] = 0.98
|
||||
grokfast_lamb: Optional[float] = 2.0
|
||||
63
src/axolotl/integrations/grokfast/optimizer.py
Normal file
@@ -0,0 +1,63 @@
|
||||
# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
|
||||
# Reference: https://github.com/ironjr/grokfast
|
||||
|
||||
# pylint: skip-file
|
||||
from collections import deque
|
||||
from typing import Dict, Literal, Optional
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
def gradfilter_ma(
|
||||
m: nn.Module,
|
||||
grads: Optional[Dict[str, deque]] = None,
|
||||
window_size: int = 100,
|
||||
lamb: float = 5.0,
|
||||
filter_type: Literal["mean", "sum"] = "mean",
|
||||
warmup: bool = True,
|
||||
trigger: bool = False, # For ablation study.
|
||||
) -> Dict[str, deque]:
|
||||
if grads is None:
|
||||
grads = {
|
||||
n: deque(maxlen=window_size)
|
||||
for n, p in m.named_parameters()
|
||||
if p.requires_grad and p.grad is not None
|
||||
}
|
||||
|
||||
for n, p in m.named_parameters():
|
||||
if p.requires_grad and p.grad is not None:
|
||||
grads[n].append(p.grad.data.detach()) # .cpu())
|
||||
|
||||
# Modify the gradients.
|
||||
if not warmup or len(grads[n]) == window_size and not trigger:
|
||||
if filter_type == "mean":
|
||||
avg = sum(grads[n]) / len(grads[n])
|
||||
elif filter_type == "sum":
|
||||
avg = sum(grads[n])
|
||||
else:
|
||||
raise ValueError(f"Unrecognized filter_type {filter_type}")
|
||||
p.grad.data = p.grad.data + avg * lamb
|
||||
|
||||
return grads
|
||||
|
||||
|
||||
def gradfilter_ema(
|
||||
m: nn.Module,
|
||||
grads: Optional[Dict[str, torch.Tensor]] = None,
|
||||
alpha: float = 0.98,
|
||||
lamb: float = 2.0,
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
if grads is None:
|
||||
grads = {
|
||||
n: p.grad.data.detach()
|
||||
for n, p in m.named_parameters()
|
||||
if p.requires_grad and p.grad is not None
|
||||
}
|
||||
|
||||
for n, p in m.named_parameters():
|
||||
if p.requires_grad and p.grad is not None:
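# EMA update of the gradient history, then amplification of its slow component:
# grads[n] = alpha * grads[n] + (1 - alpha) * grad; grad += lamb * grads[n]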
|
||||
grads[n] = grads[n] * alpha + p.grad.data.detach() * (1 - alpha)
|
||||
p.grad.data = p.grad.data + grads[n] * lamb
|
||||
|
||||
return grads
|
||||
@@ -18,20 +18,24 @@ Module for the Plugin for LIGER integration with Axolotl.
|
||||
Liger Kernel is the collection of Triton-native kernels for LLM Training.
|
||||
It is designed to be performant, correct, and light-weight.
|
||||
"""
|
||||
import inspect
|
||||
import logging
|
||||
import sys
|
||||
from functools import partial
|
||||
|
||||
from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
|
||||
from liger_kernel.transformers.geglu import LigerGEGLUMLP
|
||||
from liger_kernel.transformers.functional import liger_cross_entropy
|
||||
from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
|
||||
from liger_kernel.transformers.rms_norm import LigerRMSNorm
|
||||
from liger_kernel.transformers.rope import liger_rotary_pos_emb
|
||||
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
|
||||
from ...utils.distributed import zero_only
|
||||
from .args import LigerArgs # pylint: disable=unused-import. # noqa: F401
|
||||
|
||||
LOG = logging.getLogger("axolotl.integrations.liger")
|
||||
|
||||
|
||||
class LigerPlugin(BasePlugin):
|
||||
"""
|
||||
@@ -42,59 +46,31 @@ class LigerPlugin(BasePlugin):
|
||||
return "axolotl.integrations.liger.LigerArgs"
|
||||
|
||||
def pre_model_load(self, cfg):
|
||||
if cfg.model_config_type == "llama":
|
||||
from liger_kernel.transformers.model.llama import (
|
||||
lce_forward as llama_lce_forward,
|
||||
)
|
||||
from transformers.models.llama import modeling_llama
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_llama.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_llama.LlamaRMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
modeling_llama.LlamaMLP = LigerSwiGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_llama.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
elif cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_llama.LlamaForCausalLM.forward = llama_lce_forward
|
||||
|
||||
elif cfg.model_config_type == "mistral":
|
||||
from liger_kernel.transformers.model.mistral import (
|
||||
lce_forward as mistral_lce_forward,
|
||||
)
|
||||
from transformers.models.mistral import modeling_mistral
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_mistral.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_mistral.MistralRMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
modeling_mistral.MistralMLP = LigerSwiGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
|
||||
|
||||
elif cfg.model_config_type == "gemma":
|
||||
from liger_kernel.transformers.model.gemma import (
|
||||
lce_forward as gemma_lce_forward,
|
||||
)
|
||||
from transformers.models.gemma import modeling_gemma
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_gemma.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_gemma.GemmaRMSNorm = partial(
|
||||
LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
|
||||
if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
|
||||
apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
|
||||
liger_fn_sig = inspect.signature(apply_liger_fn)
|
||||
kwargs = {}
|
||||
if "rope" in liger_fn_sig.parameters:
|
||||
kwargs["rope"] = cfg.liger_rope
|
||||
if "cross_entropy" in liger_fn_sig.parameters:
|
||||
kwargs["cross_entropy"] = cfg.liger_cross_entropy
|
||||
if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
|
||||
kwargs[
|
||||
"fused_linear_cross_entropy"
|
||||
] = cfg.liger_fused_linear_cross_entropy
|
||||
if "rms_norm" in liger_fn_sig.parameters:
|
||||
kwargs["rms_norm"] = cfg.liger_rms_norm
|
||||
if "layer_norm" in liger_fn_sig.parameters:
|
||||
kwargs["layer_norm"] = cfg.liger_layer_norm
|
||||
if "geglu" in liger_fn_sig.parameters:
|
||||
kwargs["geglu"] = cfg.liger_glu_activation
|
||||
elif "swiglu" in liger_fn_sig.parameters:
|
||||
kwargs["swiglu"] = cfg.liger_glu_activation
|
||||
with zero_only():
|
||||
LOG.info(
|
||||
f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
|
||||
)
|
||||
if cfg.liger_swiglu:
|
||||
modeling_gemma.GemmaMLP = LigerGEGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_gemma.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_gemma.GemmaForCausalLM.forward = gemma_lce_forward
|
||||
|
||||
apply_liger_fn(**kwargs)
|
||||
elif cfg.model_config_type == "jamba":
|
||||
from transformers.models.jamba import modeling_jamba
|
||||
|
||||
@@ -104,30 +80,14 @@ class LigerPlugin(BasePlugin):
|
||||
modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_jamba.JambaRMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
if cfg.liger_glu_activation:
|
||||
modeling_jamba.JambaMLP = LigerSwiGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_jamba.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
from transformers.loss.loss_utils import nn
|
||||
|
||||
nn.functional.cross_entropy = liger_cross_entropy
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
|
||||
|
||||
elif cfg.model_config_type == "qwen2":
|
||||
from liger_kernel.transformers.model.qwen2 import (
|
||||
lce_forward as qwen2_lce_forward,
|
||||
)
|
||||
from transformers.models.qwen2 import modeling_qwen2
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_qwen2.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_qwen2.Qwen2RMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
modeling_qwen2.Qwen2MLP = LigerSwiGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
|
||||
|
||||
elif cfg.model_config_type == "deepseek_v2":
|
||||
from accelerate import init_empty_weights
|
||||
from transformers import AutoModelForCausalLM
|
||||
@@ -146,44 +106,11 @@ class LigerPlugin(BasePlugin):
|
||||
logging.warning("Fused liger_rope is not supported for DeepseekV2.")
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
if cfg.liger_glu_activation:
|
||||
modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
|
||||
if cfg.liger_cross_entropy:
|
||||
# We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
|
||||
# nn.CrossEntropyLoss in the forward method.
|
||||
modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
|
||||
|
||||
elif cfg.model_config_type == "gemma2":
|
||||
from transformers.models.gemma2 import modeling_gemma2
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_gemma2.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_gemma2.Gemma2RMSNorm = partial(
|
||||
LigerRMSNorm, offset=1.0, init_fn="zeros", casting_mode="gemma"
|
||||
)
|
||||
if cfg.liger_swiglu:
|
||||
modeling_gemma2.Gemma2MLP = LigerGEGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_gemma2.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
logging.warning(
|
||||
"Fused linear cross entropy is not supported for Gemma 2."
|
||||
)
|
||||
|
||||
elif cfg.model_config_type == "phi3":
|
||||
from liger_kernel.transformers.model.phi3 import (
|
||||
lce_forward as phi3_lce_forward,
|
||||
)
|
||||
from transformers.models.phi3 import modeling_phi3
|
||||
|
||||
if cfg.liger_rope:
|
||||
modeling_phi3.apply_rotary_pos_emb = liger_rotary_pos_emb
|
||||
if cfg.liger_rms_norm:
|
||||
modeling_phi3.Phi3RMSNorm = LigerRMSNorm
|
||||
if cfg.liger_swiglu:
|
||||
modeling_phi3.Phi3MLP = LigerSwiGLUMLP
|
||||
if cfg.liger_cross_entropy:
|
||||
modeling_phi3.CrossEntropyLoss = LigerCrossEntropyLoss
|
||||
if cfg.liger_fused_linear_cross_entropy:
|
||||
modeling_phi3.Phi3ForCausalLM.forward = phi3_lce_forward
|
||||
|
||||
@@ -15,9 +15,12 @@
 """
 Module for handling LIGER input arguments.
 """
+import logging
 from typing import Optional
 
-from pydantic import BaseModel
+from pydantic import BaseModel, model_validator
 
+LOG = logging.getLogger("axolotl.integrations.liger.args")
+
 
 class LigerArgs(BaseModel):
@@ -27,6 +30,26 @@ class LigerArgs(BaseModel):
 
     liger_rope: Optional[bool] = None
     liger_rms_norm: Optional[bool] = None
+    liger_layer_norm: Optional[bool] = None
-    liger_swiglu: Optional[bool] = None
+    liger_glu_activation: Optional[bool] = None
     liger_cross_entropy: Optional[bool] = None
     liger_fused_linear_cross_entropy: Optional[bool] = None
+
+    liger_pref_rl: Optional[bool] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_deprecated_swiglu(cls, data):
+        if data.get("liger_swiglu") is not None:
+            if data.get("liger_glu_activation") is not None:
+                raise ValueError(
+                    "You cannot have both `liger_swiglu` and `liger_glu_activation` set."
+                )
+
+            LOG.warning(
+                "The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
+                "Please use 'liger_glu_activation' instead."
+            )
+            data["liger_glu_activation"] = data.pop("liger_swiglu")
+        return data
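With the deprecation shim above, a current config selects the GLU kernel via the new key (an illustrative sketch; the plugin path follows the convention of the other integrations):

```yaml
plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
```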
0
src/axolotl/integrations/liger/trainer/__init__.py
Normal file
253
src/axolotl/integrations/liger/trainer/dpo_trainer.py
Normal file
@@ -0,0 +1,253 @@
|
||||
"""
|
||||
integration of liger dpo kernels with dpotrainer
|
||||
"""
|
||||
from typing import Dict, List, Literal, Union
|
||||
|
||||
import torch
|
||||
from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
|
||||
from liger_kernel.transformers.trainer.orpo_trainer import _FSDPForwardRedirection
|
||||
from torch import nn
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel
|
||||
|
||||
from axolotl.core.trainers.base import AxolotlDPOTrainer
|
||||
|
||||
|
||||
class AxolotlLigerDPOTrainer(AxolotlDPOTrainer):
|
||||
"""
|
||||
Extend the DPO Trainer to use LIGER kernels for DPO
|
||||
"""
|
||||
|
||||
def concatenated_forward(
|
||||
self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
|
||||
):
|
||||
"""
|
||||
Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together,
|
||||
and compute the DPO loss using Liger's fused kernel.
|
||||
|
||||
This method replaces the original `concatenated_forward` implementation to use Liger.
|
||||
"""
|
||||
|
||||
# Prepare concatenated inputs
|
||||
concatenated_batch = self.concatenated_inputs(batch, self.padding_value)
|
||||
|
||||
# Extract concatenated inputs
|
||||
prompt_input_ids = concatenated_batch["prompt_input_ids"]
|
||||
prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
|
||||
completion_input_ids = concatenated_batch["completion_input_ids"]
|
||||
completion_attention_mask = concatenated_batch["completion_attention_mask"]
|
||||
|
||||
# For encoder-decoder models, you'd need to construct decoder_input_ids, etc.
|
||||
# This example assumes a causal decoder-only model.
|
||||
input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
|
||||
attention_mask = torch.cat(
|
||||
(prompt_attention_mask, completion_attention_mask), dim=1
|
||||
)
|
||||
|
||||
# Align inputs by removing leading padding
|
||||
for i in range(attention_mask.size(0)):
|
||||
first_one_idx = torch.nonzero(attention_mask[i])[0].item()
|
||||
input_ids[i] = torch.roll(input_ids[i], shifts=-first_one_idx)
|
||||
attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)
|
||||
|
||||
# Remove trailing empty columns
|
||||
empty_cols = torch.sum(attention_mask, dim=0) == 0
|
||||
if empty_cols.any():
|
||||
first_empty_col = torch.nonzero(empty_cols)[0].item()
|
||||
input_ids = input_ids[:, :first_empty_col]
|
||||
attention_mask = attention_mask[:, :first_empty_col]
|
||||
|
||||
if self.args.max_length is not None:
|
||||
input_ids = input_ids[:, : self.args.max_length]
|
||||
attention_mask = attention_mask[:, : self.args.max_length]
|
||||
|
||||
# Build labels from the concatenated ids shifted by one token (next-token prediction);
# for a causal LM, only the completion part contributes to the loss
|
||||
labels = torch.cat(
|
||||
(torch.zeros_like(prompt_input_ids), completion_input_ids), dim=1
|
||||
)
|
||||
labels = labels[:, 1:] # shift left by one
|
||||
attention_mask = attention_mask[:, 1:]
|
||||
labels = labels[:, : attention_mask.size(1)]
|
||||
|
||||
# Mask out the prompt portion from loss
|
||||
labels[~attention_mask.bool()] = self.label_pad_token_id
|
||||
|
||||
# Prepare reference model hidden states if ref_model exists
|
||||
use_ref_model = self.ref_model is not None and not self.reference_free
|
||||
|
||||
# Run main model forward to get hidden states
|
||||
# If using FSDP, redirect forward calls
|
||||
if isinstance(model, FullyShardedDataParallel):
|
||||
outputs = _FSDPForwardRedirection()(
|
||||
model,
|
||||
model._fsdp_wrapped_module.model, # pylint: disable=protected-access
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
use_cache=False,
|
||||
)
|
||||
else:
|
||||
# If model is a DataParallel, unwrap
|
||||
if isinstance(model, torch.nn.DataParallel):
|
||||
model = model.module
|
||||
outputs = model.model(
|
||||
input_ids, attention_mask=attention_mask, use_cache=False
|
||||
)
|
||||
|
||||
last_hidden_state = outputs.last_hidden_state
|
||||
|
||||
ref_last_hidden_state = None
|
||||
if use_ref_model:
|
||||
ref_model = self.ref_model
|
||||
if isinstance(ref_model, FullyShardedDataParallel):
|
||||
with torch.no_grad():
|
||||
ref_outputs = _FSDPForwardRedirection()(
|
||||
ref_model,
|
||||
ref_model._fsdp_wrapped_module.model, # pylint: disable=protected-access
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
use_cache=False,
|
||||
)
|
||||
else:
|
||||
if isinstance(ref_model, torch.nn.DataParallel):
|
||||
ref_model = ref_model.module
|
||||
with torch.no_grad():
|
||||
ref_outputs = ref_model.model(
|
||||
input_ids, attention_mask=attention_mask, use_cache=False
|
||||
)
|
||||
ref_last_hidden_state = ref_outputs.last_hidden_state
|
||||
|
||||
# Retrieve lm_head parameters
|
||||
lm_head = model.lm_head
|
||||
ref_lm_head = (
|
||||
self.ref_model.lm_head
|
||||
if (use_ref_model and self.ref_model is not None)
|
||||
else None
|
||||
)
|
||||
|
||||
# Use Liger fused DPO loss
|
||||
dpo_loss_fn = LigerFusedLinearDPOLoss(
|
||||
ignore_index=self.label_pad_token_id,
|
||||
beta=self.beta,
|
||||
compute_nll_loss=False,
|
||||
compiled=True,
|
||||
use_ref_model=use_ref_model,
|
||||
)
|
||||
|
||||
# call fused Liger DPO
|
||||
if use_ref_model:
|
||||
loss_acc, aux_outputs = dpo_loss_fn(
|
||||
lm_head.weight, # lin_weight
|
||||
last_hidden_state, # _input
|
||||
labels, # target
|
||||
bias=lm_head.bias,
|
||||
ref_input=ref_last_hidden_state,
|
||||
ref_weight=ref_lm_head.weight,
|
||||
ref_bias=ref_lm_head.bias,
|
||||
)
|
||||
|
||||
(
|
||||
policy_chosen_logps,
|
||||
policy_rejected_logps,
|
||||
policy_chosen_logits_mean,
|
||||
policy_rejected_logits_mean,
|
||||
policy_nll_loss,
|
||||
) = aux_outputs[:5]
|
||||
|
||||
else:
|
||||
# No reference model scenario: Liger kernel treats ref_logps as 0
|
||||
loss_acc, aux_outputs = dpo_loss_fn(
|
||||
lm_head.weight,
|
||||
last_hidden_state,
|
||||
labels,
|
||||
bias=lm_head.bias,
|
||||
)
|
||||
(
|
||||
policy_chosen_logps,
|
||||
policy_rejected_logps,
|
||||
policy_chosen_logits_mean,
|
||||
policy_rejected_logits_mean,
|
||||
policy_nll_loss,
|
||||
) = aux_outputs[:5]
|
||||
|
||||
# Add aux loss if enabled
|
||||
if self.aux_loss_enabled and hasattr(outputs, "aux_loss"):
|
||||
loss_acc = loss_acc + self.aux_loss_coef * outputs.aux_loss
|
||||
|
||||
# Add RPO loss if requested (RPO is a variant that adds NLL loss)
|
||||
if self.args.rpo_alpha is not None:
|
||||
# policy_nll_loss: average negative log-likelihood of chosen completions
|
||||
loss_acc = loss_acc + self.args.rpo_alpha * policy_nll_loss.mean()
|
||||
|
||||
return (
|
||||
loss_acc,
|
||||
policy_chosen_logps,
|
||||
policy_rejected_logps,
|
||||
policy_chosen_logits_mean,
|
||||
policy_rejected_logits_mean,
|
||||
policy_nll_loss,
|
||||
)
|
||||
|
||||
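For readers unfamiliar with the fused call above, here is a minimal standalone sketch of LigerFusedLinearDPOLoss, assuming the liger-kernel package is installed and exposes the call shape used in concatenated_forward; all shapes are illustrative only, with chosen and rejected sequences stacked along the batch dimension.

    import torch
    from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

    # hypothetical tiny dimensions, not from the trainer above
    batch, seq_len, hidden, vocab = 4, 32, 64, 128
    lm_head_weight = torch.randn(vocab, hidden, requires_grad=True)
    hidden_states = torch.randn(batch, seq_len, hidden, requires_grad=True)
    labels = torch.randint(0, vocab, (batch, seq_len))

    loss_fn = LigerFusedLinearDPOLoss(ignore_index=-100, beta=0.1, use_ref_model=False)
    loss, aux = loss_fn(lm_head_weight, hidden_states, labels)
    loss.backward()  # gradients flow to both the head weight and the hidden states

The point of the fused kernel is that the lm_head projection and the DPO loss are computed in chunks, so the full (batch, seq, vocab) logits tensor is never materialized.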
def get_batch_loss_metrics(
    self,
    model,
    batch: Dict[str, Union[List, torch.LongTensor]],
    train_eval: Literal["train", "eval"] = "train",
):
    """
    Compute the DPO loss and other metrics for a given batch using the Liger fused kernel.
    """
    metrics = {}

    (
        loss,
        policy_chosen_logps,
        policy_rejected_logps,
        policy_chosen_logits_mean,
        policy_rejected_logits_mean,
        policy_nll_loss,
    ) = self.concatenated_forward(model, batch)

    # For metrics, we approximate chosen/rejected rewards as beta * (log π(y) - log π_ref(y)) if a ref model is used.
    # If no ref model is used, we can't compute reward_accuracies meaningfully. For simplicity, we assume ref_model presence.
    if self.ref_model is not None and not self.reference_free:
        # If you want full parity with the original DPOTrainer metrics (like chosen_rewards, rejected_rewards),
        # you'd need to run a reference forward or store reference log ps. The Liger kernel currently doesn't
        # return ref_chosen_logps/ref_rejected_logps explicitly. By design, Liger directly computes DPO.
        #
        # Here we approximate chosen_rewards and rejected_rewards from the difference in chosen/rejected logps.
        # Since Liger DPO does not output ref logps separately, you may need to modify the Liger kernel to
        # also output them if you need all the metrics. For now, we'll skip them or provide a placeholder.

        # Placeholder: chosen/rejected "rewards" can't be retrieved directly from Liger as-is.
        # If needed, integrate ref_chosen_logps/ref_rejected_logps into the Liger kernel returns.
        chosen_rewards = policy_chosen_logps * self.beta  # approximation
        rejected_rewards = policy_rejected_logps * self.beta  # approximation
        reward_accuracies = (chosen_rewards > rejected_rewards).float()
        metrics[f"{train_eval}_rewards/chosen"] = chosen_rewards.mean().cpu().item()
        metrics[f"{train_eval}_rewards/rejected"] = (
            rejected_rewards.mean().cpu().item()
        )
        metrics[f"{train_eval}_rewards/accuracies"] = (
            reward_accuracies.mean().cpu().item()
        )
        metrics[f"{train_eval}_rewards/margins"] = (
            (chosen_rewards - rejected_rewards).mean().cpu().item()
        )

    metrics[f"{train_eval}_logps/chosen"] = policy_chosen_logps.mean().cpu().item()
    metrics[f"{train_eval}_logps/rejected"] = (
        policy_rejected_logps.mean().cpu().item()
    )
    metrics[f"{train_eval}_logits/chosen"] = (
        policy_chosen_logits_mean.detach().cpu().item()
    )
    metrics[f"{train_eval}_logits/rejected"] = (
        policy_rejected_logits_mean.detach().cpu().item()
    )

    if self.args.rpo_alpha is not None:
        metrics[f"{train_eval}_nll_loss"] = (
            policy_nll_loss.mean().detach().cpu().item()
        )

    return loss.mean(), metrics
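To make the approximation in the comments concrete: the exact DPO reward the original trainer reports is beta * (log π(y) - log π_ref(y)). A small hypothetical helper, assuming the ref_* log-probs were available (Liger does not currently return them):

    import torch

    def dpo_rewards(beta, policy_chosen_logps, policy_rejected_logps,
                    ref_chosen_logps, ref_rejected_logps):
        # reward_i = beta * (log pi(y_i | x) - log pi_ref(y_i | x))
        chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
        rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
        margins = chosen_rewards - rejected_rewards
        accuracies = (chosen_rewards > rejected_rewards).float()
        return chosen_rewards, rejected_rewards, margins, accuracies

The placeholder above drops the ref_* terms, so the reported chosen/rejected rewards are shifted by a per-example constant; margins and accuracies are therefore only approximate as well.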
0 src/axolotl/monkeypatch/__init__.py (Normal file)
0 src/axolotl/monkeypatch/attention/__init__.py (Normal file)
@@ -1,231 +0,0 @@
"""
|
||||
monkeypatch to add a get_turns method
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Generator, Tuple
|
||||
|
||||
from fastchat.conversation import SeparatorStyle
|
||||
|
||||
LOG = logging.getLogger("axolotl.monkeypatch.fastchat_conversation_turns")
|
||||
|
||||
|
||||
def get_prompt(self) -> str:
|
||||
ret = ""
|
||||
for role, msg in self.get_turns():
|
||||
ret += role + msg
|
||||
return ret
|
||||
|
||||
|
||||
def get_turns( # pylint: disable=too-many-return-statements
|
||||
self,
|
||||
) -> Generator[Tuple[str, str], None, None]:
|
||||
"""Get the prompt for generation."""
|
||||
system_prompt = self.system_template.format(system_message=self.system_message)
|
||||
if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
|
||||
yield "", system_prompt + self.sep
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + ": ", message + self.sep
|
||||
else:
|
||||
yield role + ":", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.ADD_COLON_TWO:
|
||||
seps = [self.sep, self.sep2]
|
||||
yield "", system_prompt + seps[0]
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
yield role + ": ", message + seps[i % 2]
|
||||
else:
|
||||
yield role + ":", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
|
||||
yield "", system_prompt + self.sep
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + ": ", message + self.sep
|
||||
else:
|
||||
yield role + ": ", "" # must be end with a space
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
|
||||
yield "", "" if system_prompt == "" else system_prompt + self.sep
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + "\n", message + self.sep
|
||||
else:
|
||||
yield role + "\n", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
|
||||
yield "", system_prompt
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role, message + self.sep
|
||||
else:
|
||||
yield role, ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.NO_COLON_TWO:
|
||||
seps = [self.sep, self.sep2]
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
yield role, message + seps[i % 2]
|
||||
else:
|
||||
yield role, ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.RWKV:
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
yield role + ": ", message.replace("\r\n", "\n").replace(
|
||||
"\n\n", "\n"
|
||||
) + "\n\n"
|
||||
else:
|
||||
yield role + ":", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.LLAMA2 and self.name != "mistral":
|
||||
if self.system_message:
|
||||
if self.messages:
|
||||
# For llama, the system message is incorporated into the first human instruction
|
||||
first_role, first_msg = self.messages[0]
|
||||
if first_role == self.roles[0]:
|
||||
system_prompt += first_msg
|
||||
self.messages.pop(0)
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
if (i % 2 == 0 and not self.system_message) or (
|
||||
i % 2 != 0 and self.system_message
|
||||
):
|
||||
role = "<s> " + role
|
||||
yield role + " ", message
|
||||
else:
|
||||
yield role, ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.LLAMA2 and self.name == "mistral":
|
||||
contains_sys_msg = False
|
||||
if self.system_message:
|
||||
contains_sys_msg = True
|
||||
if self.messages:
|
||||
# There is no clear guidance on how to handle system messages in Mistral so we just prepend it to the first human instruction separated by a newline
|
||||
first_role, first_msg = self.messages[0]
|
||||
if first_role == self.roles[0]:
|
||||
system_prompt = self.system_template.format(
|
||||
system_message=" " + self.system_message
|
||||
)
|
||||
system_prompt += first_msg
|
||||
self.messages.pop(0)
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message and i == 0 and not contains_sys_msg:
|
||||
yield "", system_prompt.strip() + " " + message # if there is no system message, we need to make sure there is the a `<s> [INST]` at the beginning of the first instruction.
|
||||
elif message:
|
||||
yield role + " ", message
|
||||
else:
|
||||
yield role, ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.LLAMA3:
|
||||
if self.system_message:
|
||||
# For llama3, the system message is NOT incorporated into the first human instruction
|
||||
# All messages follow <|start_header_id|>' + role + '<|end_header_id|>\n\n'+ message + '<|eot_id|>
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", f"{message.strip()}<|eot_id|>"
|
||||
else:
|
||||
yield f"<|start_header_id|>{role}<|end_header_id|>\n\n", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.GEMMA:
|
||||
if self.system_message:
|
||||
raise ValueError("Gemma chat template does not support system messages")
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
prefix = "<bos>" if i == 0 else ""
|
||||
message_str = message if message else ""
|
||||
yield prefix + "<start_of_turn>" + role + "\n", message_str + "<end_of_turn>\n"
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.CHATGLM:
|
||||
# source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
|
||||
# source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
|
||||
round_add_n = 1 if self.name == "chatglm2" else 0
|
||||
if system_prompt:
|
||||
yield "", system_prompt + self.sep
|
||||
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if i % 2 == 0:
|
||||
yield "", f"[Round {i//2 + round_add_n}]{self.sep}"
|
||||
|
||||
if message:
|
||||
yield f"{role}:", f"{message}{self.sep}"
|
||||
else:
|
||||
yield f"{role}:", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.CHATML:
|
||||
yield "", "" if system_prompt == "" else system_prompt + self.sep + "\n"
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + "\n", message + self.sep + "\n"
|
||||
else:
|
||||
yield role + "\n", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.CHATGLM3:
|
||||
if self.system_message:
|
||||
yield "", system_prompt
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + "\n", " " + message
|
||||
else:
|
||||
yield role
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.CHATINTERN:
|
||||
# source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
|
||||
seps = [self.sep, self.sep2]
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
prefix = "<s>" if i % 2 == 0 else ""
|
||||
if message:
|
||||
yield prefix + role + ":", message + seps[i % 2] + "\n"
|
||||
else:
|
||||
yield role + ":", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.DOLLY:
|
||||
seps = [self.sep, self.sep2]
|
||||
yield "", system_prompt
|
||||
for i, (role, message) in enumerate(self.messages):
|
||||
if message:
|
||||
suffix = "\n\n" if i % 2 == 1 else ""
|
||||
yield role + ":\n", message + seps[i % 2] + suffix
|
||||
else:
|
||||
yield role + ":\n", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.PHOENIX:
|
||||
yield "", system_prompt
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + ": ", "<s>" + message + "</s>"
|
||||
else:
|
||||
yield role + ": " + "<s>", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.ROBIN:
|
||||
yield "", system_prompt + self.sep
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + ":\n", message + self.sep
|
||||
else:
|
||||
yield role + ":\n", ""
|
||||
return
|
||||
if self.sep_style == SeparatorStyle.FALCON_CHAT:
|
||||
if self.system_message:
|
||||
yield "", system_prompt + self.sep
|
||||
for role, message in self.messages:
|
||||
if message:
|
||||
yield role + ": ", message + self.sep
|
||||
else:
|
||||
yield role + ":", ""
|
||||
else:
|
||||
raise ValueError(f"Invalid style: {self.sep_style}")
|
||||
|
||||
|
||||
def add_get_turns_to_conversation():
|
||||
import fastchat.conversation
|
||||
|
||||
fastchat.conversation.Conversation.get_turns = get_turns
|
||||
fastchat.conversation.Conversation.get_prompt = get_prompt
|
||||
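A short usage sketch of the (now-removed) patch, assuming fastchat is installed: apply the patch once, then stream (role, message) turns instead of one flat prompt string.

    from fastchat.conversation import get_conv_template

    add_get_turns_to_conversation()

    conv = get_conv_template("vicuna_v1.1")
    conv.append_message(conv.roles[0], "Hello!")
    conv.append_message(conv.roles[1], None)  # empty slot for the model's reply
    for role, message in conv.get_turns():
        print(repr(role), repr(message))

Splitting the prompt into per-turn (role, message) pairs is what lets tokenizing strategies mask roles individually rather than post-processing one concatenated string.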
@@ -4,7 +4,6 @@

 import logging
 import warnings
 from functools import partial
 from typing import List, Optional, Tuple, Union

 import torch
@@ -94,13 +93,32 @@ def replace_llama_qkv_with_fused(model):
     set_module_name(model, name, qkv)


-def patch_llama_cross_entropy():
-    from flash_attn.losses.cross_entropy import CrossEntropyLoss
-
-    LOG.info("patching with flash_attn.losses.cross_entropy")
-    transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
-        CrossEntropyLoss, inplace_backward=True
+def patch_fa_llama_cross_entropy():
+    LOG.info(
+        "patching transformers.loss.loss_utils.fixed_cross_entropy with flash_attn.ops.triton.cross_entropy"
     )
+    from flash_attn.ops.triton.cross_entropy import (
+        cross_entropy_loss as flash_attn_cross_entropy_loss,
+    )
+
+    def fa2_fixed_cross_entropy(
+        source,
+        target,
+        num_items_in_batch: int = None,
+        ignore_index: int = -100,
+        **kwargs,
+    ):  # pylint: disable=unused-argument
+        reduction = "sum" if num_items_in_batch is not None else "mean"
+        loss, _ = flash_attn_cross_entropy_loss(
+            source, target, ignore_index=ignore_index
+        )
+        if reduction == "sum":
+            loss = loss.sum() / num_items_in_batch
+        else:
+            loss = loss.sum() / (target != ignore_index).sum()
+        return loss
+
+    transformers.loss.loss_utils.fixed_cross_entropy = fa2_fixed_cross_entropy


 def patch_llama_rms_norm():
@@ -147,7 +165,7 @@ def replace_llama_attn_with_flash_attn(

     # skip only if explicitly disabled
     if cross_entropy:
-        patch_llama_cross_entropy()
+        patch_fa_llama_cross_entropy()

     # skip only if explicitly disabled
     if rms_norm:
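A quick sanity sketch (pure PyTorch, no flash-attn needed) of the reduction logic in fa2_fixed_cross_entropy: summing per-token losses and dividing by the number of non-ignored targets reproduces mean cross-entropy, while the "sum" branch instead divides by the externally supplied num_items_in_batch.

    import torch
    import torch.nn.functional as F

    logits = torch.randn(8, 10)
    target = torch.randint(0, 10, (8,))
    target[:2] = -100  # ignored positions contribute zero loss

    per_token = F.cross_entropy(logits, target, ignore_index=-100, reduction="none")
    manual_mean = per_token.sum() / (target != -100).sum()
    assert torch.allclose(
        manual_mean, F.cross_entropy(logits, target, ignore_index=-100)
    )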
@@ -1,4 +1,5 @@
 """multipack patching for v2 of sample packing"""

 import importlib

 import transformers
@@ -27,71 +28,28 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
 ]


-def patch_for_multipack(model_type, model_name=None, is_remote_code=False):
-    if model_type == "gemmoe":
-        patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
-    elif model_type == "deepseek_v2":
-        patch_remote(model_name, ".configuration_deepseek", ".modeling_deepseek")
-    elif hasattr(transformers, "modeling_flash_attention_utils") and not is_remote_code:
+def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
+    if has_remote_code:
+        patch_remote(model_name)
+    elif hasattr(transformers, "modeling_flash_attention_utils"):
         transformers.modeling_flash_attention_utils._get_unpad_data = (  # pylint: disable=protected-access
             get_unpad_data
         )
         if model_type == "mixtral" and is_deepspeed_zero3_enabled():
             patch_mixtral_moe_forward_zero3()
         return
-
-    # retain for legacy
-    if model_type == "mixtral":
-        transformers.models.mixtral.modeling_mixtral._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-        if is_deepspeed_zero3_enabled():
-            patch_mixtral_moe_forward_zero3()
-    elif model_type == "llama":
-        if hasattr(transformers.models.llama.modeling_llama, "_get_unpad_data"):
-            transformers.models.llama.modeling_llama._get_unpad_data = (  # pylint: disable=protected-access
-                get_unpad_data
-            )
-    elif model_type == "mistral":
-        if hasattr(transformers.models.mistral.modeling_mistral, "_get_unpad_data"):
-            transformers.models.llama.modeling_llama._get_unpad_data = (  # pylint: disable=protected-access
-                get_unpad_data
-            )
-    elif model_type == "qwen2":
-        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "qwen2_moe":
-        transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "falcon":
-        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "phi":
-        transformers.models.phi.modeling_phi._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "gemma":
-        transformers.models.gemma.modeling_gemma._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "gemma2":
-        transformers.models.gemma2.modeling_gemma2._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "starcoder2":
-        transformers.models.starcoder2.modeling_starcoder2._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    if model_type == "mixtral" and is_deepspeed_zero3_enabled():
-        patch_mixtral_moe_forward_zero3()


-def patch_remote(model_name, config_name, modeling_name):
+def patch_remote(model_name):
     model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
     # we need to load the model here in order for modeling_* to be available
     with init_empty_weights():
         AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-    module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
+    parts = model_config.__class__.__module__.split(".")
+    parts[-1] = parts[-1].replace("configuration_", "modeling_", 1)
+    module_name = ".".join(parts)
     modeling_arch = importlib.import_module(module_name)
-    modeling_arch._get_unpad_data = get_unpad_data  # pylint: disable=protected-access
+    if hasattr(modeling_arch, "_get_unpad_data"):
+        modeling_arch._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
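The new patch_remote no longer needs per-model config/modeling suffixes because it derives the modeling module name generically. The derivation in isolation, with hypothetical module names:

    # swap the "configuration_" prefix of the config class's module for "modeling_"
    module = "transformers_modules.some-org.some-model.configuration_foo"  # made-up example
    parts = module.split(".")
    parts[-1] = parts[-1].replace("configuration_", "modeling_", 1)
    print(".".join(parts))  # transformers_modules.some-org.some-model.modeling_foo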
@@ -46,9 +46,10 @@ def reset_optimizer(
     *,
     reset_params: List[str],  # where str is the key to a torch.nn.Parameter
     optimizer_state_keys: List[str],
-    prune_ratio: float = 0.9,
+    optimizer_magnitude_pruning: float = 0.9,
 ):
-    pruning_fn = partial(magnitude_pruning_, prune_ratio=prune_ratio)
+    # pylint:disable=unused-argument
+    pruning_fn = partial(magnitude_pruning_, prune_ratio=optimizer_magnitude_pruning)
     n_zeros = 0
     n_total = 0

@@ -56,16 +57,22 @@ def reset_optimizer(
     if isinstance(optimizer, ZeroRedundancyOptimizer):
         optimizer_state = optimizer.optim.state

-    for param in reset_params:
-        param_state = optimizer_state[param]
-        if len(param_state) == 0:  # no state for this param, happens for ZeRo optimizer
-            continue
-        for key in optimizer_state_keys:
-            pruning_fn(
-                param_state[key]
-            )  # pruning fn has to be in-place to keep the same keys in the dict
-            n_total += param_state[key].numel()
-            n_zeros += torch.sum(param_state[key] == 0).item()
+    for group in optimizer.param_groups:
+        for param in group["params"]:
+            state = optimizer_state[param]
+            for key, value in state.items():
+                if key not in optimizer_state_keys:
+                    continue
+                if torch.is_tensor(value):
+                    try:
+                        pruning_fn(value)
+                        n_total += value.numel()
+                        n_zeros += torch.sum(value == 0).item()
+                    except RuntimeError as exc:
+                        if "quantile() input tensor is too large" in str(exc):
+                            pass
+                        else:
+                            raise exc

     _zeroed = n_zeros / (1e-7 + n_total) * 100
     LOG.info(f"Percent of optimizer states zeroed: {_zeroed:.2f}")
@@ -129,6 +136,9 @@ class ReLoRACallback(TrainerCallback):

         if "adam" in args.optim.lower():
             optimizer_state_keys = ["exp_avg", "exp_avg_sq"]
+            if "8bit" in args.optim.lower():
+                optimizer_state_keys.append("state1")
+                optimizer_state_keys.append("state2")
         else:
             raise ValueError(f"Optimizer {args.optim} not supported with ReLoRA")

@@ -160,7 +170,7 @@ class ReLoRACallback(TrainerCallback):
                 optimizer,
                 reset_params=lora_params,
                 optimizer_state_keys=optimizer_state_keys,
-                prune_ratio=args.relora_prune_ratio,
+                optimizer_magnitude_pruning=args.relora_prune_ratio,
             )

             if self.quantized:
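For context, a plausible in-place magnitude-pruning step like the one reset_optimizer applies to each optimizer-state tensor; magnitude_pruning_ itself is defined elsewhere in this file, so this is only a sketch of the idea. It also shows why the "quantile() input tensor is too large" RuntimeError is caught above: torch.quantile has an input-size limit on large tensors.

    import torch

    def magnitude_pruning_sketch(tensor: torch.Tensor, prune_ratio: float = 0.9) -> None:
        # may raise RuntimeError on very large tensors, as handled in reset_optimizer
        threshold = torch.quantile(tensor.abs().float().flatten(), prune_ratio)
        tensor[tensor.abs() < threshold] = 0.0  # zero the smallest 90% in place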
80 src/axolotl/monkeypatch/trainer_fsdp_optim.py (Normal file)
@@ -0,0 +1,80 @@
"""
|
||||
fix for FSDP optimizer save in trainer w 4.47.0
|
||||
"""
|
||||
import inspect
|
||||
import logging
|
||||
|
||||
from transformers import Trainer
|
||||
|
||||
from axolotl.monkeypatch.unsloth_ import detab_code
|
||||
|
||||
LOG = logging.getLogger("axolotl.monkeypatch.trainer_fsdp_save")
|
||||
|
||||
ORIGINAL_TRAINER_CODE = """
|
||||
|
||||
delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled
|
||||
|
||||
"""
|
||||
|
||||
PATCHED_TRAINER_CODE = """
|
||||
|
||||
delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
|
||||
|
||||
"""
|
||||
|
||||
|
||||
def get_training_loop_code() -> str:
|
||||
training_loop = inspect.getsource(
|
||||
Trainer._inner_training_loop # pylint: disable=protected-access
|
||||
)
|
||||
return training_loop
|
||||
|
||||
|
||||
def check_training_loop_is_patchable() -> bool:
|
||||
training_loop = get_training_loop_code()
|
||||
training_loop, _ = detab_code(training_loop)
|
||||
return ORIGINAL_TRAINER_CODE in training_loop
|
||||
|
||||
|
||||
def patch_training_loop_for_fsdp():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for fsdp with optimizer save
|
||||
"""
|
||||
|
||||
try:
|
||||
training_loop = get_training_loop_code()
|
||||
except OSError:
|
||||
return
|
||||
Trainer._original_inner_training_loop = ( # pylint: disable=protected-access
|
||||
training_loop
|
||||
)
|
||||
training_loop, _ = detab_code(training_loop)
|
||||
if ORIGINAL_TRAINER_CODE not in training_loop:
|
||||
return
|
||||
|
||||
training_loop = training_loop.replace(ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE)
|
||||
training_loop = training_loop.replace(
|
||||
"def _inner_training_loop(",
|
||||
"def _fixed_inner_training_loop(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.trainer
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.trainer):
|
||||
if item in training_loop:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.trainer import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(training_loop, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching _inner_training_loop for fsdp optimizer save")
|
||||
Trainer._inner_training_loop = ( # pylint: disable=protected-access
|
||||
_fixed_inner_training_loop # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
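Usage sketch: the patch rewrites Trainer._inner_training_loop from its source text, so it has to run before the Trainer is constructed, while inspect.getsource still sees the unmodified transformers 4.47.0 loop.

    from axolotl.monkeypatch.trainer_fsdp_optim import patch_training_loop_for_fsdp

    patch_training_loop_for_fsdp()  # silently a no-op if the transformers source no longer matches
    # ... then build the Trainer and run training as usual

Note the deliberately defensive design: if the expected code snippet is absent (e.g. a newer transformers release), the function returns without patching rather than raising.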
290 src/axolotl/monkeypatch/trainer_grad_accum.py (Normal file)
@@ -0,0 +1,290 @@
"""
|
||||
fix for FSDP gradient accumulation
|
||||
see https://github.com/huggingface/transformers/pull/35128
|
||||
"""
|
||||
import inspect
|
||||
import logging
|
||||
|
||||
from transformers import LlamaForCausalLM, Trainer
|
||||
|
||||
from axolotl.monkeypatch.unsloth_ import detab_code
|
||||
|
||||
LOG = logging.getLogger("axolotl.monkeypatch.trainer_grad_accum")
|
||||
|
||||
ORIGINAL_CONTEXT_CODE = """
|
||||
with self.compute_loss_context_manager():
|
||||
if self.model_accepts_loss_kwargs:
|
||||
loss = self.compute_loss(model, inputs)
|
||||
else:
|
||||
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||
"""
|
||||
|
||||
PATCHED_CONTEXT_CODE = """
|
||||
with self.compute_loss_context_manager():
|
||||
if self.model_accepts_loss_kwargs:
|
||||
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||
else:
|
||||
loss = self.compute_loss(model, inputs)
|
||||
"""
|
||||
|
||||
ORIGINAL_LLAMA_FCLM_CODE = """
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||
"""
|
||||
|
||||
PATCHED_LLAMA_FCLM_CODE = """
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# remove num_items_in_batch otherwise self.model attempts to pass it to flash_attention
|
||||
num_items_in_batch = kwargs.pop("num_items_in_batch", None)
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, num_items_in_batch=num_items_in_batch, **kwargs)
|
||||
"""
|
||||
|
||||
|
||||
def get_training_step_code() -> str:
|
||||
training_step = inspect.getsource(
|
||||
Trainer.training_step # pylint: disable=protected-access
|
||||
)
|
||||
return training_step
|
||||
|
||||
|
||||
def check_training_step_is_patchable() -> bool:
|
||||
training_step = get_training_step_code()
|
||||
training_step, _ = detab_code(training_step)
|
||||
return ORIGINAL_CONTEXT_CODE in training_step
|
||||
|
||||
|
||||
def patch_training_step_for_ga():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for gradient accumulation
|
||||
"""
|
||||
|
||||
try:
|
||||
training_step = get_training_step_code()
|
||||
except OSError:
|
||||
return
|
||||
Trainer._original_training_step = training_step # pylint: disable=protected-access
|
||||
training_step, _ = detab_code(training_step)
|
||||
if ORIGINAL_CONTEXT_CODE not in training_step:
|
||||
return
|
||||
# assert (
|
||||
# ORIGINAL_CONTEXT_CODE in training_step
|
||||
# ), "Original training_step code not found"
|
||||
|
||||
training_step = training_step.replace(ORIGINAL_CONTEXT_CODE, PATCHED_CONTEXT_CODE)
|
||||
training_step = training_step.replace(
|
||||
"def training_step(",
|
||||
"def _fixed_training_step(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.trainer
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.trainer):
|
||||
if item in training_step:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.trainer import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(training_step, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching training_step")
|
||||
Trainer.training_step = ( # pylint: disable=protected-access
|
||||
_fixed_training_step # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
|
||||
|
||||
def get_model_forward_code() -> str:
|
||||
forward = inspect.getsource(
|
||||
LlamaForCausalLM.forward # pylint: disable=protected-access
|
||||
)
|
||||
return forward
|
||||
|
||||
|
||||
def check_forward_is_patchable() -> bool:
|
||||
forward = get_model_forward_code()
|
||||
forward, _ = detab_code(forward)
|
||||
return ORIGINAL_LLAMA_FCLM_CODE in forward
|
||||
|
||||
|
||||
def patch_forward_for_ga():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for gradient accumulation
|
||||
"""
|
||||
|
||||
try:
|
||||
forward = get_model_forward_code()
|
||||
except OSError:
|
||||
return
|
||||
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
|
||||
forward, _ = detab_code(forward)
|
||||
if ORIGINAL_LLAMA_FCLM_CODE not in forward:
|
||||
return
|
||||
# assert ORIGINAL_LLAMA_FCLM_CODE in forward, "Original forward code not found"
|
||||
|
||||
forward = forward.replace(ORIGINAL_LLAMA_FCLM_CODE, PATCHED_LLAMA_FCLM_CODE)
|
||||
forward = forward.replace(
|
||||
"def forward(",
|
||||
"def _fixed_forward(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.models.llama.modeling_llama
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.models.llama.modeling_llama):
|
||||
if item in forward:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.models.llama.modeling_llama import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching forward")
|
||||
LlamaForCausalLM.forward = ( # pylint: disable=protected-access
|
||||
_fixed_forward # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
|
||||
|
||||
ORIGINAL_TRAINER_CODE = """
|
||||
context = (
|
||||
functools.partial(self.accelerator.no_sync, model=model)
|
||||
if i != len(batch_samples) - 1
|
||||
else contextlib.nullcontext
|
||||
)
|
||||
with context():
|
||||
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
||||
"""
|
||||
|
||||
PATCHED_TRAINER_CODE = """
|
||||
disable_deepspeed_no_sync = (
|
||||
self.accelerator.distributed_type == DistributedType.DEEPSPEED
|
||||
# and self.accelerator.deepspeed_engine_wrapped.engine.zero_optimization_partition_gradients()
|
||||
)
|
||||
context = (
|
||||
functools.partial(self.accelerator.no_sync, model=model)
|
||||
if i != len(batch_samples) - 1 and not disable_deepspeed_no_sync
|
||||
else contextlib.nullcontext
|
||||
)
|
||||
with context():
|
||||
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
|
||||
"""
|
||||
|
||||
|
||||
def get_training_loop_code() -> str:
|
||||
training_loop = inspect.getsource(
|
||||
Trainer._inner_training_loop # pylint: disable=protected-access
|
||||
)
|
||||
return training_loop
|
||||
|
||||
|
||||
def check_training_loop_is_patchable() -> bool:
|
||||
training_loop = get_training_loop_code()
|
||||
training_loop, _ = detab_code(training_loop)
|
||||
return ORIGINAL_TRAINER_CODE in training_loop
|
||||
|
||||
|
||||
def patch_training_loop_for_deepspeed_0_16_x():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for deepspeed GA
|
||||
|
||||
see https://github.com/huggingface/transformers/pull/35157
|
||||
"""
|
||||
|
||||
try:
|
||||
training_loop = get_training_loop_code()
|
||||
except OSError:
|
||||
return
|
||||
Trainer._original_inner_training_loop = ( # pylint: disable=protected-access
|
||||
training_loop
|
||||
)
|
||||
training_loop, _ = detab_code(training_loop)
|
||||
if ORIGINAL_TRAINER_CODE not in training_loop:
|
||||
return
|
||||
|
||||
training_loop = training_loop.replace(ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE)
|
||||
training_loop = training_loop.replace(
|
||||
"def _inner_training_loop(",
|
||||
"def _fixed_inner_training_loop(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.trainer
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.trainer):
|
||||
if item in training_loop:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.trainer import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(training_loop, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching _inner_training_loop for fsdp optimizer save")
|
||||
Trainer._inner_training_loop = ( # pylint: disable=protected-access
|
||||
_fixed_inner_training_loop # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
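Why passing num_items_in_batch through matters, in miniature: with gradient accumulation, averaging the loss per micro-batch and then averaging those means is not the same as dividing the summed token loss by the total token count (which is what num_items_in_batch enables). Hypothetical token losses:

    import torch

    losses = [torch.tensor([1.0, 3.0]), torch.tensor([5.0])]  # token losses per micro-batch
    per_microbatch_mean = sum(l.mean() for l in losses) / len(losses)              # 3.5
    global_mean = sum(l.sum() for l in losses) / sum(l.numel() for l in losses)    # 3.0
    print(per_microbatch_mean.item(), global_mean.item())

With variable numbers of non-masked tokens per micro-batch, the two differ, which is the loss-scaling bug the referenced transformers PRs address.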
@@ -9,10 +9,7 @@ import torch
 from accelerate.logging import get_logger
 from peft import PeftModelForCausalLM
 from torch import nn
-from transformers.models.llama.modeling_llama import (
-    LlamaFlashAttention2,
-    LlamaForCausalLM,
-)
+from transformers.models.llama.modeling_llama import LlamaFlashAttention2

 LOG = get_logger("axolotl.monkeypatch.unsloth")

@@ -55,11 +52,6 @@ def original_apply_o(self, hidden_states):
     return attn_output


-def get_forward_code() -> str:
-    forward = inspect.getsource(LlamaForCausalLM.forward)
-    return forward
-
-
 def get_self_attn_code() -> str:
     forward = inspect.getsource(LlamaFlashAttention2.forward)
     return forward
@@ -102,12 +94,22 @@ def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:


 def detab_code(code: str) -> Tuple[str, str]:
-    spaces = re.match(r"([\s\t]{1,})", code).group(0)
-    code = re.sub(r"^" + spaces, "", code, flags=re.MULTILINE)
+    try:
+        spaces = re.match(r"([\s\t]{1,})", code).group(0)
+        code = re.sub(r"^" + spaces, "", code, flags=re.MULTILINE)
+    except AttributeError:
+        return code, ""
     return code, spaces


+self_attn_lora_patched = False  # pylint: disable=invalid-name
+
+
 def patch_self_attn_lora():
+    global self_attn_lora_patched  # pylint: disable=global-statement
+    if self_attn_lora_patched:
+        # prevent patching multiple times
+        return
     self_attn_forward = get_self_attn_code()
     LlamaFlashAttention2._original_forward = (  # pylint: disable=protected-access
         self_attn_forward
@@ -139,6 +141,7 @@ def patch_self_attn_lora():
         globals(),
     )
     exec(self_attn_forward, globals())  # pylint: disable=exec-used  # nosec B102
+    self_attn_lora_patched = True
     LOG.info("patching unsloth attn lora", main_process_only=True)
     LlamaFlashAttention2.forward = (
         unsloth_attn_forward  # pylint: disable=undefined-variable  # noqa: F821
@@ -188,7 +191,7 @@ def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
         for module in layer_modules
     )
     mlp_not_dora = all(
-        getattr(module, "lora_magnitude_vector", None) is None
+        len(getattr(module, "lora_magnitude_vector", []) or []) == 0
         for module in layer_modules
     )

@@ -213,7 +216,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
         for module in layer_modules
     )
     qkv_not_dora = all(
-        getattr(module, "lora_magnitude_vector", None) is None
+        len(getattr(module, "lora_magnitude_vector", []) or []) == 0
         for module in layer_modules
     )

@@ -232,7 +235,7 @@ def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
         for module in layer_modules
     )
     o_not_dora = all(
-        getattr(module, "lora_magnitude_vector", None) is None
+        len(getattr(module, "lora_magnitude_vector", []) or []) == 0
         for module in layer_modules
    )
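The loosened "not DoRA" check above handles PEFT versions where lora_magnitude_vector is an (initially empty) container rather than None; presumably the old "is None" test misreported such modules as DoRA. A toy illustration with stand-in objects:

    # empty dict stands in for PEFT's container type; both objects are "not DoRA"
    module_old_peft = type("M", (), {"lora_magnitude_vector": None})()
    module_new_peft = type("M", (), {"lora_magnitude_vector": {}})()

    for m in (module_old_peft, module_new_peft):
        not_dora = len(getattr(m, "lora_magnitude_vector", []) or []) == 0
        print(not_dora)  # True for both; the old "is None" check fails on the second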
@@ -28,6 +28,8 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
         :return:
         """

+        max_length = self.prompter.max_length
+
         self.messages = "chosen_messages"
         # pylint: disable=duplicate-code
         prompt[self.messages] = []
@@ -39,6 +41,16 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
         prompt[self.messages].append({"role": "assistant", "content": prompt["chosen"]})
         chosen_tokenized = super().tokenize_prompt(prompt)

+        if len(chosen_tokenized["input_ids"]) > max_length:
+            LOG.warning(
+                f"Chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}",
+            )
+
+        chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
+        chosen_tokenized["attention_mask"] = chosen_tokenized["attention_mask"][
+            :max_length
+        ]
+
         self.messages = "rejected_messages"
         # pylint: disable=duplicate-code
         prompt[self.messages] = []
@@ -52,6 +64,18 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
         )
         rejected_tokenized = super().tokenize_prompt(prompt)

+        if len(rejected_tokenized["input_ids"]) > max_length:
+            LOG.warning(
+                f"Rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}",
+            )
+
+        rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
+            :max_length
+        ]
+        rejected_tokenized["attention_mask"] = rejected_tokenized["attention_mask"][
+            :max_length
+        ]
+
         return {
             "input_ids_chosen": chosen_tokenized["input_ids"],
             "attention_mask_chosen": chosen_tokenized["attention_mask"],
@@ -80,9 +104,9 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
         "roles": ds_cfg.get("roles"),
         "drop_system_message": ds_cfg.get("drop_system_message", False),
         # we need to add one for detecting sequences exceeding the `sequence_len` limit
-        "max_length": cfg.sequence_len + 1
-        if not cfg.reward_model
-        else cfg.sequence_len,
+        "max_length": (
+            cfg.sequence_len + 1 if not cfg.reward_model else cfg.sequence_len
+        ),
     }

     strategy_params = {
@@ -42,6 +42,7 @@ class ChatTemplatePrompter(Prompter):
             "gpt": "assistant",
             "system": "system",
         }
+
         self.message_field_role = message_field_role
         self.message_field_content = message_field_content
         self.message_field_training = message_field_training
@@ -53,21 +54,9 @@ class ChatTemplatePrompter(Prompter):
         self.drop_system_message = drop_system_message

     def build_prompt(self, conversation, add_generation_prompt=False, images=None):
-        turns = [
-            {
-                "role": self.roles[t[self.message_field_role]],
-                "content": t[self.message_field_content],
-                "training": t.get(self.message_field_training, None),
-            }
-            for t in conversation
-        ]
-
-        if self.drop_system_message and turns[0]["role"] == "system":
-            turns = turns[1:]
-
         if self.processor:
             text = self.processor.apply_chat_template(
-                turns,
+                conversation,
                 chat_template=self.chat_template,
                 tokenize=False,
                 add_generation_prompt=add_generation_prompt,
@@ -76,8 +65,6 @@ class ChatTemplatePrompter(Prompter):
                 text=text,
                 images=images,
                 return_tensors="pt",
-                truncation=True,
-                max_length=self.max_length,
             )
             # workaround since processor works in batches instead of single examples
             for k, val in batch.items():
@@ -88,9 +75,7 @@ class ChatTemplatePrompter(Prompter):
             return batch

         return self.tokenizer.apply_chat_template(
-            turns,
-            truncation=True,
-            max_length=self.max_length,
+            conversation,
             add_generation_prompt=add_generation_prompt,
             chat_template=self.chat_template,
         )
@@ -215,7 +200,14 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
         train_on_eos=None,
     ):
         super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
-        self.roles_to_train = roles_to_train if roles_to_train is not None else []
+
+        self.roles_to_train = []
+        if roles_to_train:
+            # map roles if they exist in prompter.roles, else use the role as-is
+            self.roles_to_train = [
+                prompter.roles.get(role, role) for role in roles_to_train
+            ]
+
         self.train_on_eos = train_on_eos
         self.images = "images"

@@ -262,30 +254,28 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):

             return tokenized_prompt

-        turns = prompt[self.messages]
+        turns = self.get_conversation_thread(prompt)
         input_ids = self.prompter.build_prompt(turns)
         labels = [IGNORE_TOKEN_ID] * len(input_ids)

         last_eos_idx = -1
         for index, turn in enumerate(turns):
-            role = turn.get(self.prompter.message_field_role)
-            content = turn.get(self.prompter.message_field_content)
-            train_turn = turn.get(self.prompter.message_field_training)
-            train_detail = turn.get(self.prompter.message_field_training_detail)
+            role = turn.get("role")
+            content = turn.get("content")
+            train_turn = turn.get("training")
+            train_detail = turn.get("training_detail")

             LOG.debug(
                 f"Processing turn {index}: role={role}, content={content}, train_turn={train_turn}, train_detail={train_detail}"
             )

-            should_train = (
-                train_turn
-                if train_turn is not None
-                else (
-                    bool(train_detail is not None)
-                    if train_detail is not None
-                    else self.train_on_inputs or role in self.roles_to_train
-                )
-            )
+            should_train = None
+            if train_turn is not None:
+                should_train = train_turn
+            elif train_detail is not None:
+                should_train = bool(train_detail)
+            else:
+                should_train = self.train_on_inputs or role in self.roles_to_train

             LOG.debug(f"Should train: {should_train}")

@@ -293,6 +283,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                 conversation_ids=input_ids, turn=index, turn_content=turn
             )

+            if turn_start_idx == -1 or turn_end_idx == -1:
+                LOG.warning(f"Failed to find boundaries for turn {index}")
+
             LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")

             if should_train and turn_start_idx != -1 and turn_end_idx != -1:
@@ -313,7 +306,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                 labels[turn_start_idx:turn_end_idx] = input_ids[
                     turn_start_idx:turn_end_idx
                 ]
-                LOG.debug(f"Labels set for range {turn_start_idx}:{turn_end_idx}")
+                LOG.debug(
+                    f"Set labels for training from {turn_start_idx} to {turn_end_idx}"
+                )

             LOG.debug(f"Labels after processing turn {index}: {labels}")

@@ -351,52 +346,73 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
             return i
         return -1

-    def find_turn(self, conversation_ids, turn, turn_content):
+    def find_turn(self, conversation_ids: list[int], turn: int, turn_content: dict):
         """
         Locate the starting and ending indices of the specified turn in a conversation.

         Args:
             conversation_ids (list[int]): Token IDs representing the conversation.
             turn (int): The turn number to locate (based on EOS tokens).
             turn_content (dict): Dict containing the content of the turn.

         Returns:
             tuple: (start_idx, end_idx) indices of the start and end of the turn content.
                 Returns (-1, -1) if the turn content is not found.
         """
-        content = turn_content.get(self.prompter.message_field_content, "")
+        content = turn_content.get("content")
         content_ids = self.tokenizer.encode(content, add_special_tokens=False)

-        eos_token_id = self.tokenizer.eos_token_id
-        eos_count = 0
-        start_search_idx = 0
+        LOG.debug(f"content_ids (length {len(content_ids)}): {content_ids}")

-        # Locate the starting index after the specified number of EOS tokens
-        for i, token_id in enumerate(conversation_ids):
-            if token_id == eos_token_id:
-                eos_count += 1
-                if eos_count == turn:
-                    start_search_idx = (
-                        i + 1
-                    )  # Start searching after the specified turn's EOS token
-                    break
+        if not content_ids:
+            LOG.warning(f"Empty content for turn {turn}")
+            return -1, -1

-        # Find the start index of the content within the conversation
-        start_idx = -1
-        for i in range(start_search_idx, len(conversation_ids) - len(content_ids) + 1):
-            if conversation_ids[i : i + len(content_ids)] == content_ids:
-                start_idx = i
-                break
-
-        if start_idx != -1:
-            end_idx = start_idx + len(content_ids)
-        else:
-            end_idx = -1
-
-        return start_idx, end_idx
+        # For the first turn, start from the beginning
+        if turn == 0:
+            start_search_idx = 0
+        else:
+            # For subsequent turns, find the previous EOS token
+            eos_token_id = self.tokenizer.eos_token_id
+            eos_count = 0
+            start_search_idx = 0
+
+            for i, token_id in enumerate(conversation_ids):
+                if token_id == eos_token_id:
+                    eos_count += 1
+                    if eos_count == turn:  # Find the nth EOS token where n = turn
+                        start_search_idx = i + 1
+                        break
+
+        # we could optimize this to only search a few tokens from start_search_idx,
+        # but that would risk missing the content if it isn't found within the first
+        # few tokens or if start_search_idx cannot be found above
+        last_index = len(conversation_ids) - len(content_ids) + 1
+
+        if last_index < start_search_idx:
+            LOG.warning(
+                f"last_index to search is less than start_search_idx for turn {turn}"
+            )
+            return -1, -1
+
+        # Search for content starting from start_search_idx
+        first_elem = content_ids[0]
+        for i in range(start_search_idx, last_index):
+            # Quick check of first element before doing full comparison
+            if conversation_ids[i] == first_elem:
+                # Check if the rest of the content matches
+                if conversation_ids[i : i + len(content_ids)] == content_ids:
+                    LOG.debug(f"Found turn {turn} content at position {i}")
+                    return i, i + len(content_ids)
+
+        return -1, -1

     def get_conversation_thread(self, prompt):
-        return prompt[self.messages]
+        turns = [
+            {
+                "role": self.prompter.roles[t[self.prompter.message_field_role]],
+                "content": t[self.prompter.message_field_content],
+                "training": t.get(self.prompter.message_field_training),
+                "training_detail": t.get(self.prompter.message_field_training_detail),
+            }
+            for t in prompt[self.messages]
+        ]
+
+        if self.prompter.drop_system_message and turns[0]["role"] == "system":
+            turns = turns[1:]
+
+        return turns

     def get_images(self, prompt):
         return prompt.get(self.images, None)
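A toy illustration of the find_turn scan above, using plain ints in place of a tokenizer: locate turn N by skipping N EOS tokens, then match the content ids.

    conversation_ids = [5, 6, 2, 7, 8, 9, 2]  # hypothetical ids; 2 == eos_token_id
    content_ids = [7, 8, 9]                   # tokens of turn 1's content

    eos_positions = [i for i, t in enumerate(conversation_ids) if t == 2]
    start_search_idx = eos_positions[0] + 1   # search after the 1st EOS for turn == 1
    for i in range(start_search_idx, len(conversation_ids) - len(content_ids) + 1):
        if conversation_ids[i : i + len(content_ids)] == content_ids:
            print("turn found at", i, "to", i + len(content_ids))  # 3 to 6
            break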
@@ -8,17 +8,36 @@ def argilla(
     **kwargs,
 ):  # pylint: disable=possibly-unused-variable,unused-argument
     def transform_fn(sample):
+        if "prompt" in sample.keys():
+            prompt_key = "prompt"
+        elif "input" in sample.keys():
+            prompt_key = "input"
+        elif "question" in sample.keys():
+            prompt_key = "question"
+        else:
+            prompt_key = "instruction"
+
+        if "chosen" in sample.keys():
+            chosen_key = "chosen"
+        else:
+            chosen_key = "chosen_response"
+
+        if "rejected" in sample.keys():
+            rejected_key = "rejected"
+        else:
+            rejected_key = "rejected_response"
+
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
                 f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
-                f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
+                f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
             )
         else:
             sample[
                 "prompt"
-            ] = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen_response']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected_response']}<|im_end|>"
+            ] = f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
+        sample["chosen"] = f"{sample[chosen_key]}<|im_end|>"
+        sample["rejected"] = f"{sample[rejected_key]}<|im_end|>"
         return sample

     return transform_fn
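Example of the transform on a hypothetical Argilla-style row, assuming the leading parameters elided from the hunk above are optional:

    sample = {
        "instruction": "What is 2+2?",
        "chosen_response": "4",
        "rejected_response": "5",
    }
    transform_fn = argilla()  # key detection falls back to instruction/*_response
    out = transform_fn(sample)
    # out["prompt"]   == "<|im_start|>user\nWhat is 2+2?<|im_end|>\n<|im_start|>assistant\n"
    # out["chosen"]   == "4<|im_end|>"
    # out["rejected"] == "5<|im_end|>"

The change makes the transform tolerant of datasets that name the prompt column "prompt", "input", or "question" and the preference columns "chosen"/"rejected" instead of the older "*_response" variants.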
@@ -8,17 +8,37 @@ def argilla(
     **kwargs,
 ):  # pylint: disable=possibly-unused-variable,unused-argument
     def transform_fn(sample):
+        # pylint: disable=duplicate-code
+        if "prompt" in sample.keys():
+            prompt_key = "prompt"
+        elif "input" in sample.keys():
+            prompt_key = "input"
+        elif "question" in sample.keys():
+            prompt_key = "question"
+        else:
+            prompt_key = "instruction"
+
+        if "chosen" in sample.keys():
+            chosen_key = "chosen"
+        else:
+            chosen_key = "chosen_response"
+
+        if "rejected" in sample.keys():
+            rejected_key = "rejected"
+        else:
+            rejected_key = "rejected_response"
+
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
                 f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
-                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+                f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
             )
         else:
             sample[
                 "prompt"
-            ] = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-        sample["chosen"] = f"{sample['chosen_response']}<|eot_id|>"
-        sample["rejected"] = f"{sample['rejected_response']}<|eot_id|>"
+            ] = f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+        sample["chosen"] = f"{sample[chosen_key]}<|eot_id|>"
+        sample["rejected"] = f"{sample[rejected_key]}<|eot_id|>"
         return sample

     return transform_fn
@@ -1,33 +0,0 @@
"""Module containing the InstructShareGPTPromptTokenizingStrategy class"""
from typing import Any, Dict, Optional

from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompterV2


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    conversation = (
        ds_cfg["conversation"] if ds_cfg and "conversation" in ds_cfg else None
    )
    strategy = InstructShareGPTPromptTokenizingStrategy(
        # pylint: disable=duplicate-code
        ShareGPTPrompterV2(
            conversation=conversation,
        ),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    return strategy


class InstructShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
    """
    basic sharegpt strategy to grab conversations from the sample row
    """

    def get_conversation_thread(self, prompt):
        return [
            {"from": "human", "value": prompt["instruction"]},
            {"from": "gpt", "value": prompt["output"]},
        ]
@@ -29,7 +29,7 @@ from dataclasses import dataclass, field
 from typing import Generator, List, Sequence

 from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import IGNORE_TOKEN_ID, SHAREGPT_ASSERTION_FAILED_ROLE
+from axolotl.prompters import ALTERNATING_ASSERTION_FAILED_ROLE, IGNORE_TOKEN_ID


 @dataclass
@@ -75,7 +75,7 @@ class Llama2ChatConversation:

 class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):
     """
-    Tokenizing strategy for ShareGPT prompts.
+    Tokenizing strategy for Llama2 prompts.
     adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
     """

@@ -191,7 +191,7 @@ class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
         conv.messages = []  # pylint: disable=R0801
         for j, sentence in enumerate(source):
             role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2], SHAREGPT_ASSERTION_FAILED_ROLE
+            assert role == conv.roles[j % 2], ALTERNATING_ASSERTION_FAILED_ROLE
             if sentence["value"]:
                 conv.append_message(role, sentence["value"])
             yield conv