proof of concept for sage attention

.gitignore improvements (#349 ) [skip ci]
updated colab notebook (#2074 )
2024-11-22 14:47:19 -05:00 · 2024-11-22 11:08:54 -05:00 · 2024-11-22 10:09:10 -05:00 · 2024-11-21 14:32:41 -05:00 · 2024-11-21 13:36:51 -05:00 · 2024-11-21 13:24:52 -05:00
200 changed files with 12583 additions and 4318 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -24,27 +24,41 @@ jobs:
            python_version: "3.11"
            pytorch: 2.3.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.10"
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.5.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
    steps:
      - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
-        uses: docker/metadata-action@v3
+        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl-base
+          images: |
+            winglian/axolotl-base
+            axolotlai/axolotl-base
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v4
        with:
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -17,7 +17,7 @@ jobs:
        - name: Set up Quarto
          uses: quarto-dev/quarto-actions/setup@v2
        - name: Setup Python
-          uses: actions/setup-python@v3
+          uses: actions/setup-python@v5
          with:
            python-version: '3.10'
        - name: install dependencies
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
       - '**.py'
       - 'requirements.txt'
       - '.github/workflows/*.yml'
-       - "*.md"
+       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
  workflow_dispatch:

@@ -15,9 +15,9 @@ jobs:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,11 +4,13 @@ on:
  push:
    branches:
      - "main"
+    tags:
+      - "v*"
  workflow_dispatch:

 jobs:
  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -27,7 +29,12 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -37,7 +44,12 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl
+          images: |
+            winglian/axolotl
+            axolotlai/axolotl
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
@@ -51,7 +63,7 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
@@ -65,7 +77,7 @@ jobs:

  build-axolotl-cloud:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -84,7 +96,12 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -94,20 +111,25 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl-cloud
+          images: |
+            winglian/axolotl-cloud
+            axolotlai/axolotl-cloud
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
@@ -118,7 +140,7 @@ jobs:

  build-axolotl-cloud-no-tmux:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -136,20 +158,25 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl-cloud-term
+          images: |
+            winglian/axolotl-cloud-term
+            axolotlai/axolotl-cloud-term
+          tags: |
+            type=ref,event=branch
+            type=pep440,pattern={{version}}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud-no-tmux
          push: ${{ github.event_name != 'pull_request' }}
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -1,13 +1,21 @@
 name: docker-multigpu-tests-biweekly

 on:
+  pull_request:
+    paths:
+      - 'tests/e2e/multigpu/*.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
  test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -18,10 +26,17 @@ jobs:
            pytorch: 2.3.1
            axolotl_extras:
            num_gpus: 2
-          - cuda: 121
-            cuda_version: 12.1.1
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.4.1
+            axolotl_extras:
+            num_gpus: 2
+            nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -7,7 +7,7 @@ on:

 jobs:
  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
@@ -26,7 +26,12 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -36,7 +41,9 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl
+          images: |
+            winglian/axolotl
+            axolotlai/axolotl
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Set up Docker Buildx
@@ -64,7 +71,7 @@ jobs:

  build-axolotl-cloud:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
@@ -83,7 +90,12 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -93,7 +105,9 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl-cloud
+          images: |
+            winglian/axolotl-cloud
+            axolotlai/axolotl-cloud
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Login to Docker Hub
@@ -102,7 +116,7 @@ jobs:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
+        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -3,12 +3,24 @@ name: publish pypi
 on:
  push:
    tags:
-      - '*'
+      - 'v*'
+  workflow_dispatch:

 jobs:
+  setup_release:
+    name: Create Release
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+    steps:
+      - name: Create release
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: gh release create "$GITHUB_REF_NAME" # GITHUB_REF_NAME is the tag name in `on.push.tags` workflows
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
+    needs: [setup_release]
    environment:
      name: pypi
      url: https://pypi.org/p/axolotl
@@ -16,10 +28,10 @@ jobs:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

@@ -27,7 +39,7 @@ jobs:
        run: |
          pip3 install wheel packaging
          pip3 install -e .
-          pip3 install -r requirements-tests.txt
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Extract tag name
        id: tag
@@ -37,9 +49,9 @@ jobs:
        run: |
          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py

-      - name: Build a binary wheel
+      - name: Build a source dist
        run: |
-          python setup.py sdist bdist_wheel
+          python setup.py sdist

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -9,12 +9,12 @@ jobs:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

@@ -25,31 +25,37 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies

+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
+
      - name: Update requirements.txt
        run: |
          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-          sed -i 's#^bitsandbytes.*#bitsandbytes @ git+https://github.com/bitsandbytes-foundation/bitsandbytes.git@main#' requirements.txt
+          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
+          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

      - name: Install dependencies
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging
          pip3 install -U -e .
-          pip3 install -r requirements-tests.txt
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Run tests
        run: |
@@ -77,17 +83,17 @@ jobs:
            num_gpus: 1
            axolotl_extras: mamba-ssm
            nightly_build: "true"
-          - cuda: 121
-            cuda_version: 12.1.1
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.4.1
            num_gpus: 1
-            axolotl_extras: mamba-ssm
+            axolotl_extras:
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.5.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -15,17 +15,22 @@ on:
       - '.github/workflows/*.yml'
  workflow_dispatch:

+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.0
+      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

@@ -36,60 +41,147 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

      - name: Setup Python
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies

-      - name: Install dependencies
+      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging
+          pip3 install --upgrade packaging setuptools wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
          pip3 install -U -e .
-          pip3 install -r requirements-tests.txt
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Run tests
        run: |
-          pytest --ignore=tests/e2e/ tests/
+          pytest -n8 --ignore=tests/e2e/ tests/

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
+  pytest-sdist:
+    name: PyTest from Source Dist
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.4.1", "2.5.1"]
+    timeout-minutes: 20
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging setuptools wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          python3 setup.py sdist
+          pip3 install dist/axolotl*.tar.gz
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Run tests
+        run: |
+          pytest -n8 --ignore=tests/e2e/ tests/
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+  docker-e2e-tests-1st:
+    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 60
-    needs: [pre-commit, pytest]
+    timeout-minutes: 90
+    needs: [pre-commit, pytest, pytest-sdist]

    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.10"
-            pytorch: 2.3.1
-            num_gpus: 1
-            axolotl_extras: mamba-ssm
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.11"
-            pytorch: 2.3.1
-            num_gpus: 1
-            axolotl_extras: mamba-ssm
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.63.64 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.tests
+
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
+    # this job needs to be run on self-hosted GPU runners...
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [pre-commit, pytest, docker-e2e-tests-1st]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 121
+            cuda_version: 12.1.1
+            python_version: "3.10"
+            pytorch: 2.3.1
+            num_gpus: 1
+            axolotl_extras: mamba-ssm
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
            num_gpus: 1
            axolotl_extras:
    steps:
--- a/.gitignore
+++ b/.gitignore
@@ -182,3 +182,6 @@ submit.sh

 typings/
 out/
+
+# vim
+*.swp
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,3 +1,3 @@
 [settings]
 profile=black
-known_third_party=wandb
+known_third_party=wandb,comet_ml
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -11,6 +11,9 @@ ignore_errors = True
 [mypy-axolotl.models.mixtral.*]
 ignore_errors = True

+[mypy-axolotl.integrations.liger.models.*]
+ignore_errors = True
+
 [mypy-axolotl.models.phi.*]
 ignore_errors = True

--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+include requirements.txt
+include README.md
+include LICENSE
+recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
@@ -1,8 +1,21 @@
-# Axolotl
+<p align="center">
+    <picture>
+        <source media="(prefers-color-scheme: dark)" srcset="image/axolotl_logo_digital_white.svg">
+        <source media="(prefers-color-scheme: light)" srcset="image/axolotl_logo_digital_black.svg">
+        <img alt="Axolotl" src="image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
+    </picture>
+</p>

-![tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg)
-![tests-nightly](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg)
-![multigpu-semi-weekly tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg)
+<p align="center">
+    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
+    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
+</p>
+<p align="center">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
+    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+</p>

 Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.

@@ -11,10 +24,10 @@ Features:
 - Supports fullfinetune, lora, qlora, relora, and gptq
 - Customize configurations using a simple yaml file or CLI overwrite
 - Load different dataset formats, use custom formats, or bring your own tokenized datasets
- Integrated with xformer, flash attention, rope scaling, and multipacking
+- Integrated with xformer, flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
 - Works with single GPU or multiple GPUs via FSDP or Deepspeed
 - Easily run with Docker locally or on the cloud
- Log results and optionally checkpoints to wandb or mlflow
+- Log results and optionally checkpoints to wandb, mlflow or Comet
 - And more!

 <a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
@@ -55,6 +68,7 @@ Features:
        - [FSDP + QLoRA](#fsdp--qlora)
        - [Weights \& Biases Logging](#weights--biases-logging)
        - [Special Tokens](#special-tokens)
+      - [Liger Kernel](#liger-kernel)
    - [Inference Playground](#inference-playground)
    - [Merge LORA to base](#merge-lora-to-base)
  - [Common Errors 🧰](#common-errors-)
@@ -74,7 +88,7 @@ Features:
 <td>

 <div align="center">
-  <img src="image/axolotl.png" alt="axolotl" width="160">
+  <img src="image/axolotl_symbol_digital_white.svg" alt="axolotl" width="160">
  <div>
    <p>
      <b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
@@ -120,7 +134,7 @@ Features:

 Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.

-**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
+**Requirements**: Nvidia GPU (Ampere architecture or newer for `bf16` and Flash Attention), Python >=3.10 and PyTorch >=2.3.1.

 ```bash
 git clone https://github.com/axolotl-ai-cloud/axolotl
@@ -158,7 +172,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
 #### Docker

  ```bash
-  docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
+  docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
  ```

  Or run on the current files for development:
@@ -177,7 +191,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl
  A more powerful Docker command to run would be this:

  ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-latest
  ```

  It additionally:
@@ -209,7 +223,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --

 #### Cloud GPU

-For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)
+For cloud GPU providers that support docker images, use [`axolotlai/axolotl-cloud:main-latest`](https://hub.docker.com/r/axolotlai/axolotl-cloud/tags)

 - on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
 - on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
@@ -318,7 +332,7 @@ Write a job description in YAML as below:
 # dstack.yaml
 type: task

-image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
+image: axolotlai/axolotl-cloud:main-latest

 env:
  - HUGGING_FACE_HUB_TOKEN
@@ -382,11 +396,10 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
        - typescript
      type: ... # unimplemented custom format

-      # fastchat conversation
-      # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+      # chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template
    - path: ...
-      type: sharegpt
-      conversation: chatml # default: vicuna_v1.1
+      type: chat_template
+      chat_template: chatml # defaults to tokenizer's chat_template

      # local
    - path: data.jsonl # or json
@@ -514,6 +527,22 @@ wandb_name:
 wandb_log_model:
 ```

+##### Comet Logging
+
+Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to wandb with `comet login`.
+
+- wandb options
+```yaml
+use_comet:
+comet_api_key:
+comet_workspace:
+comet_project_name:
+comet_experiment_key:
+comet_mode:
+comet_online:
+comet_experiment_config:
+```
+
 ##### Special Tokens

 It is important to have special tokens like delimiters, end-of-sequence, beginning-of-sequence in your tokenizer's vocabulary.  This will help you avoid tokenization issues and help your model train better.  You can do this in axolotl like this:
@@ -530,6 +559,26 @@ tokens: # these are delimiters

 When you include these tokens in your axolotl config, axolotl adds these tokens to the tokenizer's vocabulary.

+##### Liger Kernel
+
+Liger Kernel: Efficient Triton Kernels for LLM Training
+
+https://github.com/linkedin/Liger-Kernel
+
+Liger (LinkedIn GPU Efficient Runtime) Kernel is a collection of Triton kernels designed specifically for LLM training.
+It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The Liger Kernel
+composes well and is compatible with both FSDP and Deepspeed.
+
+```yaml
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_layer_norm: true
+liger_fused_linear_cross_entropy: true
+```
+
 ### Inference Playground

 Axolotl allows you to load your model in an interactive terminal playground for quick experimentation.
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -37,6 +37,7 @@ website:
            - docs/mac.qmd
            - docs/multi-node.qmd
            - docs/unsloth.qmd
+            - docs/amd_hpc.qmd
        - section: "Dataset Formats"
          contents: docs/dataset-formats/*
        - section: "Reference"
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,4 +1,4 @@
-FROM winglian/axolotl-base:{{ BASE_TAG }}
+FROM axolotlai/axolotl-base:{{ BASE_TAG }}

 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
@@ -23,12 +23,12 @@ RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
 RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^bitsandbytes.*#bitsandbytes @ git+https://github.com/bitsandbytes-foundation/bitsandbytes.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
@@ -38,7 +38,7 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
    fi

 # So we can test the Docker image
-RUN pip install -r requirements-tests.txt
+RUN pip install -r requirements-dev.txt -r requirements-tests.txt

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -e

-pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
-pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/
-pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ /workspace/axolotl/tests/e2e/
+pytest -n8 --ignore=tests/e2e/ /workspace/axolotl/tests/
+pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
+pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import Image, Stub
+from modal import App, Image

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -46,7 +46,7 @@ cicd_image = (
    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-stub = Stub("Axolotl CI/CD", secrets=[])
+app = App("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 2))
@@ -61,10 +61,10 @@ def run_cmd(cmd: str, run_folder: str):
        exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@stub.function(
+@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=45 * 60,
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072 * N_GPUS,
 )
@@ -72,6 +72,6 @@ def cicd_pytest():
    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")


-@stub.local_entrypoint()
+@app.local_entrypoint()
 def main():
    cicd_pytest.remote()
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e

 # only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import Image, Stub
+from modal import App, Image

 cicd_path = pathlib.Path(__file__).parent.resolve()

@@ -47,7 +47,7 @@ cicd_image = (
    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )

-stub = Stub("Axolotl CI/CD", secrets=[])
+app = App("Axolotl CI/CD", secrets=[])


 N_GPUS = int(os.environ.get("N_GPUS", 1))
@@ -62,10 +62,10 @@ def run_cmd(cmd: str, run_folder: str):
        exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@stub.function(
+@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=45 * 60,
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072,
 )
@@ -73,6 +73,6 @@ def cicd_pytest():
    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")


-@stub.local_entrypoint()
+@app.local_entrypoint()
 def main():
    cicd_pytest.remote()
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -14,15 +14,6 @@
  "bf16": {
    "enabled": true
  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -24,15 +24,6 @@
  "bf16": {
    "enabled": true
  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -20,15 +20,6 @@
  "bf16": {
    "enabled": true
  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
@@ -1,4 +1,4 @@
-# Example config for debugging the sharegpt prompt format
+# Example config for debugging the chat_template prompt format
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
@@ -7,8 +7,8 @@ load_in_8bit: true
 load_in_4bit: false

 datasets:
-  - path: philschmid/guanaco-sharegpt-style
-    type: sharegpt
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
    shards: 10
 val_set_size: 0
 output_dir: temp_debug/axolotl_outputs/model
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main-base
-FROM winglian/axolotl-base:$BASE_TAG
+FROM axolotlai/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
@@ -20,7 +20,6 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main
-FROM winglian/axolotl:$BASE_TAG
+FROM axolotlai/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
 ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
--- a/docker/Dockerfile-cloud-no-tmux
+++ b/docker/Dockerfile-cloud-no-tmux
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main
-FROM winglian/axolotl:$BASE_TAG
+FROM axolotlai/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
 ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -1,5 +1,5 @@
 ARG BASE_TAG=main-base
-FROM winglian/axolotl-base:$BASE_TAG
+FROM axolotlai/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
--- a/docs/amd_hpc.qmd
+++ b/docs/amd_hpc.qmd
@@ -0,0 +1,108 @@
+---
+title: Training with AMD GPUs on HPC Systems
+description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs
+---
+
+This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.
+
+## Setup
+
+### 1. Install Python
+
+We recommend using Miniforge, a minimal conda-based Python distribution:
+
+```bash
+curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
+bash Miniforge3-$(uname)-$(uname -m).sh
+```
+
+### 2. Configure Python Environment
+Add Python to your PATH and ensure it's available at login:
+
+```bash
+echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc
+echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile
+```
+
+### 3. Load AMD GPU Software
+
+Load the ROCm module:
+
+```bash
+module load rocm/5.7.1
+```
+
+Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.
+
+### 4. Install PyTorch
+
+Install PyTorch with ROCm support:
+
+```bash
+pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall
+```
+
+### 5. Install Flash Attention
+
+Clone and install the Flash Attention repository:
+
+```bash
+git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git
+export GPU_ARCHS="gfx90a"
+cd flash-attention
+export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
+patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
+pip install .
+```
+
+### 6. Install Axolotl
+
+Clone and install Axolotl:
+
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl
+cd axolotl
+pip install packaging ninja
+pip install -e .
+```
+
+### 7. Apply xformers Workaround
+
+xformers appears to be incompatible with ROCm. Apply the following workarounds:
+ - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return `False` for SwiGLU availability from xformers.
+ - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the "SwiGLU" function with a pass statement.
+
+### 8. Prepare Job Submission Script
+
+Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include
+
+```bash
+export TRANSFORMERS_OFFLINE=1
+export HF_DATASETS_OFFLINE=1
+```
+
+### 9. Download Base Model
+
+Download a base model using the Hugging Face CLI:
+
+```bash
+huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
+```
+
+### 10. Create Axolotl Configuration
+
+Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.
+
+Note: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.
+
+### 11. Preprocess Data
+
+Run preprocessing on the login node:
+
+```bash
+CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml
+```
+
+### 12. Train
+
+You are now ready to submit your previously prepared job script. 🚂
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -83,22 +83,15 @@ lora_on_cpu: true
 datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
-  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
    data_files: # Optional[str] path to source data files
    shards: # Optional[int] number of shards to split data into
    name: # Optional[str] name of dataset configuration to load
    train_on_split: train # Optional[str] name of dataset split to load from
-
-    # Optional[str] fastchat conversation type, only used with type: sharegpt
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].
+    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
+    trust_remote_code: # Optional[bool] Trust remote code for untrusted source

  # Custom user instruction prompt
  - path: repo
@@ -123,6 +116,48 @@ datasets:
      # For `completion` datsets only, uses the provided field instead of `text` column
      field:

+  # Using chat template
+  - path: ...
+    # Set type to `chat_template` to use this strategy
+    type: chat_template
+    # Specify the name of the chat template to use
+    # The name of the chat template to use for training, following values are supported:
+    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
+    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
+    # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
+    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
+    chat_template: tokenizer_default
+    # Custom jinja template for chat template. This will be only used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`).
+    chat_template_jinja:
+    # The key in the data example that contains the messages. Default is "messages".
+    field_messages: messages
+    # The key in the message turn that contains the role. Default is "role".
+    message_field_role: role
+    # The key in the message turn that contains the content. Default is "content".
+    message_field_content: content
+    # Optional[Dict[str, List]]. Roles mapping for the messages.
+    roles:
+      user: ["human", "user"]
+      assistant: ["gpt", "assistant", "ai"]
+      system: ["system"]
+
+    ## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only last message is trained on.
+
+    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
+    roles_to_train: ["gpt", "assistant"]
+    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
+    # - all: train on all EOS tokens
+    # - turn: train on the EOS token at the end of each trainable turn
+    # - last: train on the last EOS token in the conversation
+    train_on_eos: last
+    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
+    message_field_training: training
+    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
+    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
+    # See example at `docs/dataset-formats/conversation.qmd`
+    message_field_training_detail: train_detail
+
+
 # If false, the datasets will not be shuffled and will keep their original order in `datasets`.
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true
@@ -140,10 +175,19 @@ test_datasets:

 # use RL training: 'dpo', 'ipo', 'kto'
 rl:
+# whether to perform weighting if doing DPO training. Boolean.
+dpo_use_weighting:

-# Saves the desired chat template to the tokenizer_config.json for easier inferencing
-# Currently supports chatml and inst (mistral/mixtral)
-chat_template: chatml
+# The name of the chat template to use for training, following values are supported:
+# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
+# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
+# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
+# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
+# The selected chat template will be saved to the tokenizer_config.json for easier inferencing
+# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.
+chat_template: tokenizer_default
+# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
+chat_template_jinja: null
 # Changes the default system message
 default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
@@ -265,8 +309,21 @@ wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_step
 # mlflow configuration if you're using it
 mlflow_tracking_uri: # URI to mlflow
 mlflow_experiment_name: # Your experiment name
+mlflow_run_name: # Your run name
 hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry

+# Comet configuration if you're using it
+# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.
+# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start
+use_comet: # Enable or disable Comet integration.
+comet_api_key: # API key for Comet. Recommended to set via `comet login`.
+comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.
+comet_project_name: # Project name in Comet. Defaults to Uncategorized.
+comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.
+comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.
+comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.
+comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.
+
 # Where to save the full-finetuned model to
 output_dir: ./completed-model

@@ -301,7 +358,7 @@ max_steps:

 eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", chrf]
+eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]

 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
@@ -349,6 +406,7 @@ lr_div_factor: # Learning rate div factor
 # - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_apex_fused
+# - adopt_adamw (only for torch version >= 2.5.1)
 # - adafactor
 # - adamw_anyprecision
 # - sgd
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -6,31 +6,8 @@ order: 3

 ## sharegpt

-conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
+IMPORTANT: ShareGPT is deprecated!. Please see `chat_template` section below.

-```{.json filename="data.jsonl"}
-{"conversations": [{"from": "...", "value": "..."}]}
-```
-
-Note: `type: sharegpt` opens special configs:
- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
- `field_human`: specify the key to use instead of `human` in the conversation.
- `field_model`: specify the key to use instead of `gpt` in the conversation.
-
-```yaml
-datasets:
-    path: ...
-    type: sharegpt
-
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].
-```

 ## pygmalion

@@ -38,34 +15,137 @@ datasets:
 {"conversations": [{"role": "...", "value": "..."}]}
 ```

-## sharegpt.load_role

-conversations where `role` is used instead of `from`
+## chat_template
+
+Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.

 ```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
+{"conversations": [{"role": "...", "content": "..."}]}
 ```

-## sharegpt.load_guanaco
+See `config.qmd` for full configs and supported templates.

-conversations where `from` is `prompter` `assistant` instead of default sharegpt
+### Migrating from sharegpt
+
+Most configs can be adapted as follows:
+
+```yaml
+# old
+chat_template: chatml
+datasets:
+  - path: ...
+    type: sharegpt
+    conversation: chatml
+
+# new (if using tokenizer's chat_template)
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
+# new (if setting a new chat_template like chatml, gemma, etc)
+chat_template: chatml
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+```
+
+We recommend checking the below examples for other usecases.
+
+### Examples
+
+1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+```
+
+2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
+
+```yaml
+chat_template: gemma # this overwrites the tokenizer's chat_template
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train: ["assistant"]
+```
+
+3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
+
+```yaml
+chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train: ["assistant"]
+```
+
+4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
+
+```yaml
+# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
+chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+
+datasets:
+  - path: ...
+    type: chat_template
+    roles_to_train: ["assistant"]
+```
+
+5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
+
+For a data sample that looks like:

 ```{.json filename="data.jsonl"}
-{"conversations": [{"from": "...", "value": "..."}]}
+{
+  "conversations": [
+    {"from": "system", "value": "You are an AI assistant.", "train": false},
+    {"from": "human", "value": "Hello", "train": false},
+    {"from": "assistant", "value": "Hello", "train": true},
+    {"from": "human", "value": "How are you?", "train": true},
+    {
+      "from": "assistant",
+      "value": "I'm doing very well, thank you!",
+      "train_detail": [
+        {"begin_offset": 0, "end_offset": 8, "train": false},
+        {"begin_offset": 9, "end_offset": 18, "train": true},
+        {"begin_offset": 19, "end_offset": 30, "train": false},
+      ],
+    },
+    {
+        "from": "human",
+        "value": "I'm doing very well, thank you!",
+        "train": true,
+    },
+    {"from": "assistant", "value": "Hi there!", "train": true}
+  ]
+}
 ```

-## sharegpt.load_ultrachat
+The configuration would look like:

-conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
-
-```{.json filename="data.jsonl"}
-{"messages": [{"user": "...", "assistant": "..."}]}
+```yaml
+datasets:
+  - path: ...
+    type: chat_template
+    chat_template: tokenizer_default
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+    roles_to_train: []
+    train_on_eos: turn
+    message_field_training: train
+    message_field_training_detail: train_detail
 ```

-## sharegpt_jokes
-
-creates a chat where bot is asked to tell a joke, then explain why the joke is funny
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
-```
+Tip: It is not necessary to use both `message_field_training` and `message_field_training_detail` at a time.
--- a/docs/dataset-formats/tokenized.qmd
+++ b/docs/dataset-formats/tokenized.qmd
@@ -7,7 +7,7 @@ order: 5
 - Pass an empty `type:` in your axolotl config.
 - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
 - To indicate that a token should be ignored during training, set its corresponding label to `-100`.
- Do not add BOS/EOS. Axolotl will add them for you based on the default tokenizer for the model you're using.
+- You must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.
 - For pretraining, do not truncate/pad documents to the context window length.
 - For instruction training, documents must be truncated/padded as desired.

--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -51,12 +51,12 @@ While debugging it's helpful to simplify your test scenario as much as possible.

 ### Background

-The below example shows how to configure VSCode to debug data preprocessing of the `sharegpt` format.  This is the format used when you have the following in your axolotl config:
+The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format.  This is the format used when you have the following in your axolotl config:

 ```yaml
 datasets:
-  - path: <path to your sharegpt formatted dataset> # example on HF Hub: philschmid/guanaco-sharegpt-style
-    type: sharegpt
+  - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
 ```

 >[!Important]
@@ -83,7 +83,7 @@ If you developing on a remote host, you can easily use VSCode to debug remotely.

 The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project.  This is just an example configuration, so you may need to modify or copy it to suit your needs.

-For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_sharegpt.yml`, you would use the below configuration[^1].  Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.  This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
+For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1].  Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.  This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.

 ```jsonc
 // .vscode/launch.json
@@ -91,12 +91,12 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
    "version": "0.2.0",
    "configurations": [
        {
-            "name": "Debug axolotl prompt - sharegpt",
+            "name": "Debug axolotl prompt - chat_template",
            "type": "python",
            "module": "accelerate.commands.launch",
            "request": "launch",
            "args": [
-                "-m", "axolotl.cli.train", "dev_sharegpt.yml",
+                "-m", "axolotl.cli.train", "dev_chat_template.yml",
                // The flags below simplify debugging by overriding the axolotl config
                // with the debugging tips above.  Modify as needed.
                "--dataset_processes=1",      // limits data preprocessing to one process
@@ -185,7 +185,7 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3

 ## Debugging With Docker

-Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl.  Attaching VSCode to Docker takes a few more steps.
+Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl.  Attaching VSCode to Docker takes a few more steps.

 ### Setup

@@ -202,11 +202,11 @@ cd axolotl
 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
 ```

 >[!Tip]
-> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/winglian/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
+> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

 You will now be in the container.  Next, perform an editable install of Axolotl:

@@ -240,6 +240,6 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3
 </div>
 <br>

-[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/sharegpt.yml`, but this is the same thing.
+[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml`, but this is the same thing.

 [^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit.  You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html).
--- a/docs/input_output.qmd
+++ b/docs/input_output.qmd
@@ -205,7 +205,7 @@ ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
    hi there!.  goodbye  farewell</s>
 ```

-We can check that the right tokens are ingored by comparing the labels
+We can check that the right tokens are ignored by comparing the labels
 to each token:

 ```python
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -0,0 +1,28 @@
+# MultiModal / Vision Language Models (BETA)
+
+### Supported Models
+
+- Mllama, i.e. llama with vision models
+
+### Usage
+
+Currently multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama w/ LoRA,
+you'll need to use the following in YAML in combination with the rest of the required hyperparams.
+
+```yaml
+base_model: alpindale/Llama-3.2-11B-Vision-Instruct
+processor_type: AutoProcessor
+skip_prepare_dataset: true
+
+chat_template: llama3_2_vision
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+    field_messages: messages
+remove_unused_columns: false
+sample_packing: false
+
+# only finetune the Language model, leave the vision model and vision tower frozen
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+```
--- a/docs/unsloth.qmd
+++ b/docs/unsloth.qmd
@@ -11,12 +11,10 @@ standard industry baselines.

 ### Installation

-The following will install unsloth from source and downgrade xformers as unsloth is incompatible with the most up
-to date libraries.
+The following will install the correct unsloth and extras from source.

 ```bash
-pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
-pip install --no-deps --force-reinstall xformers==0.0.26.post1
+python scripts/unsloth_install.py | sh
 ```

 ### Using unsloth w Axolotl
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -2,19 +2,15 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "metadata": {
-    "id": "AKjdG7tbTb-n"
-   },
+   "metadata": {},
   "source": [
-    "# Example notebook for running Axolotl on google colab"
+    "## Setting up"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "id": "RcbNpOgWRcii"
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
@@ -22,82 +18,76 @@
    "assert (torch.cuda.is_available()==True)"
   ]
  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "h3nLav8oTRA5"
-   },
-   "source": [
-    "## Install Axolotl and dependencies"
-   ]
-  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "3c3yGAwnOIdi",
-    "outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
-    "!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
-    "!pip install flash-attn==\"2.5.0\"\n",
-    "!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""
+    "!pip install axolotl[deepspeed]"
   ]
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "id": "BW2MFr7HTjub"
-   },
+   "metadata": {},
   "source": [
-    "## Create an yaml config file"
+    "## Hugging Face login (optional)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "id": "9pkF2dSoQEUN"
-   },
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Example configuration"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "\n",
-    "# Your YAML string\n",
    "yaml_string = \"\"\"\n",
-    "base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
-    "model_type: LlamaForCausalLM\n",
-    "tokenizer_type: LlamaTokenizer\n",
+    "base_model: NousResearch/Meta-Llama-3.1-8B\n",
    "\n",
    "load_in_8bit: false\n",
    "load_in_4bit: true\n",
    "strict: false\n",
    "\n",
    "datasets:\n",
-    "  - path: mhenrichsen/alpaca_2k_test\n",
+    "  - path: tatsu-lab/alpaca\n",
    "    type: alpaca\n",
-    "dataset_prepared_path:\n",
+    "dataset_prepared_path: last_run_prepared\n",
    "val_set_size: 0.05\n",
-    "output_dir: ./outputs/qlora-out\n",
+    "output_dir: ./outputs/lora-out\n",
+    "\n",
+    "sequence_len: 2048\n",
+    "sample_packing: true\n",
+    "eval_sample_packing: true\n",
+    "pad_to_sequence_len: true\n",
    "\n",
    "adapter: qlora\n",
    "lora_model_dir:\n",
-    "\n",
-    "sequence_len: 4096\n",
-    "sample_packing: true\n",
-    "eval_sample_packing: false\n",
-    "pad_to_sequence_len: true\n",
-    "\n",
    "lora_r: 32\n",
    "lora_alpha: 16\n",
    "lora_dropout: 0.05\n",
-    "lora_target_modules:\n",
    "lora_target_linear: true\n",
    "lora_fan_in_fan_out:\n",
+    "lora_modules_to_save:\n",
+    "  - embed_tokens\n",
+    "  - lm_head\n",
    "\n",
    "wandb_project:\n",
    "wandb_entity:\n",
@@ -105,12 +95,12 @@
    "wandb_name:\n",
    "wandb_log_model:\n",
    "\n",
-    "gradient_accumulation_steps: 4\n",
-    "micro_batch_size: 2\n",
-    "num_epochs: 4\n",
-    "optimizer: paged_adamw_32bit\n",
+    "gradient_accumulation_steps: 2\n",
+    "micro_batch_size: 1\n",
+    "num_epochs: 1\n",
+    "optimizer: paged_adamw_8bit\n",
    "lr_scheduler: cosine\n",
-    "learning_rate: 0.0002\n",
+    "learning_rate: 2e-5\n",
    "\n",
    "train_on_inputs: false\n",
    "group_by_length: false\n",
@@ -121,13 +111,15 @@
    "gradient_checkpointing: true\n",
    "early_stopping_patience:\n",
    "resume_from_checkpoint:\n",
-    "local_rank:\n",
    "logging_steps: 1\n",
    "xformers_attention:\n",
-    "flash_attention: true\n",
+    "flash_attention: false\n",
+    "sdp_attention: true\n",
    "\n",
-    "warmup_steps: 10\n",
-    "evals_per_epoch: 4\n",
+    "warmup_steps: 1\n",
+    "max_steps: 25\n",
+    "evals_per_epoch: 1\n",
+    "eval_table_size:\n",
    "saves_per_epoch: 1\n",
    "debug:\n",
    "deepspeed:\n",
@@ -135,9 +127,10 @@
    "fsdp:\n",
    "fsdp_config:\n",
    "special_tokens:\n",
-    "\n",
+    "  pad_token: <|end_of_text|>\n",
    "\"\"\"\n",
    "\n",
+    "\n",
    "# Convert the YAML string to a Python dictionary\n",
    "yaml_dict = yaml.safe_load(yaml_string)\n",
    "\n",
@@ -146,31 +139,124 @@
    "\n",
    "# Write the YAML file\n",
    "with open(file_path, 'w') as file:\n",
-    "    yaml.dump(yaml_dict, file)\n"
+    "    yaml.dump(yaml_dict, file)"
   ]
  },
  {
   "cell_type": "markdown",
-   "metadata": {
-    "id": "bidoj8YLTusD"
-   },
+   "metadata": {},
   "source": [
-    "## Launch the training"
+    "Above we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\n",
+    "\n",
+    "The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:\n",
+    "\n",
+    "*   \"base model\": String value, specifies the underlying pre-trained LLM that will be used for finetuning\n",
+    "\n",
+    "Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n",
+    "\n",
+    "*   \"load_in_8bit\": Boolean value, whether to quantize the model weights into 8-bit integer.\n",
+    "\n",
+    "*   \"load_in_4bit\": Boolean value, whether to quantize the model weights into 4-bit integer.\n",
+    "\n",
+    "*   \"strict\": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n",
+    "\n",
+    "*   \"datasets\": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n",
+    "\n",
+    "*   \"val_set_size\": Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n",
+    "\n",
+    "*   \"output_dir\": String value. Path of trained model.\n",
+    "\n",
+    "For data preprocessing:\n",
+    "\n",
+    "*   \"sequence_len\": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n",
+    "\n",
+    "*   \"pad_to_sequence_len\": Boolean. Padding input to maximum sequence length.\n",
+    "\n",
+    "*   \"sample_packing\": Boolean. Specifies whether to use multi-packing with block diagonal attention.\n",
+    "\n",
+    "*   \"special_tokens\": Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n",
+    "\n",
+    "For LoRA configuration and its hyperparamters:\n",
+    "\n",
+    "*   \"adapter\": String. Either \"lora\" or \"qlora\", depending on user's choice.\n",
+    "\n",
+    "*   \"lora_model_dir\": String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n",
+    "\n",
+    "*   \"lora_r\": Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n",
+    "\n",
+    "*   \"lora_alpha\": Integer. Scale the weight matrices by $\\frac{\\text{lora_alpha}}{\\text{lora_r}}$Recommended to be fixed at 16.\n",
+    "\n",
+    "*   \"lora_dropout\": Float that is 1 or less. The dropout probability of a lora layer.\n",
+    "\n",
+    "*   \"lora_target_linear\": Boolean. If true, lora will target all linear modules in the transformers architecture.\n",
+    "\n",
+    "*   \"lora_modules_to_save\": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n",
+    "\n",
+    "See [LoRA](https://arxiv.org/abs/2106.09685) for detailed explanation of LoRA implementation.\n",
+    "\n",
+    "For the training configurations:\n",
+    "\n",
+    "*   \"gradient_accumulation_steps\": Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n",
+    "\n",
+    "*   \"micro_batch_size\": Integer. Batch size per gpu / gradient_accumulation_steps\n",
+    "\n",
+    "*   \"num_epochs\": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n",
+    "\n",
+    "*   \"optimizer\": The optimizer to use for the training.\n",
+    "\n",
+    "*   \"learning_rate\": The learning rate.\n",
+    "\n",
+    "*   \"lr_scheduler\": The learning rate scheduler to use for adjusting learning rate during training.\n",
+    "\n",
+    "*   \"train_on_inputs\": Boolean. Whether to ignore or include the user's prompt from the training labels.\n",
+    "\n",
+    "*   \"group_by_length\": Boolean. Whether to group similarly sized data to minimize padding.\n",
+    "\n",
+    "*   \"bf16\": Either \"auto\", \"true\", or \"false\". Whether to use CUDA bf16 floating point format. If set to \"auto\", will automatically apply bf16 should the gpu supports it.\n",
+    "\n",
+    "*   \"fp16\": Optional. Specifies whether to use CUDA fp16. Automatically set to true if \"bf16\" is set to true. Otherwise false.\n",
+    "\n",
+    "*   \"tf32\": Boolean. Whether to use CUDA tf32. Will override bf16.\n",
+    "\n",
+    "*   \"gradient_checkpointing\": Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n",
+    "\n",
+    "*   \"gradient_checkpointing_kwargs\": Python Dict. Fed into the trainer.\n",
+    "\n",
+    "*   \"logging_steps\": Integer. Log training information over every specified number of steps.\n",
+    "\n",
+    "*   \"flash_attention\": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.\n",
+    "\n",
+    "*   \"sdp_attention\": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers.)\n",
+    "\n",
+    "*   \"warmup_steps\": Integer. The number of pre-training steps where a very low learning rate is used.\n",
+    "\n",
+    "*   \"evals_per_epoch\": Integer. Number of evaluations to be performed within one training epoch.\n",
+    "\n",
+    "*   \"saves_per_epoch\": Integer. Number of times the model is saved in one training epoch.\n",
+    "\n",
+    "*   \"weight_decay\": Positive Float. Sets the \"strength\" of weight decay (i.e. setting the coefficient of L2 regularization)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Train the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "ydTI2Jk2RStU",
-    "outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
-    "# By using the ! the comand will be executed as a bash command\n",
    "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
   ]
  },
@@ -178,7 +264,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Play with inference"
+    "Predict with trained model"
   ]
  },
  {
@@ -187,36 +273,85 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "# By using the ! the comand will be executed as a bash command\n",
    "!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
-    "    --qlora_model_dir=\"./qlora-out\" --gradio"
+    "    --lora_model_dir=\"./outputs/lora-out\" --gradio"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Deeper Dive"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "It is also helpful to gain some familiarity over some of the core inner workings of axolotl"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Configuration Normalization"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Axolotl uses a custom Dict class, called ```DictDefault```\n",
+    "to store configurations specified in the yaml configuration file (into a Python variable named ```cfg```). The definition for this custom Dict can be found in the [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py)\n",
+    "\n",
+    "```DictDefault``` is amended such that calling a missing key from it will result in a ```None``` return type. This is important because if some configuration options aren't specified by the user, the ```None``` type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out [utils/config/__init__.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/__init__.py)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading Models, Tokenizers, and Trainer"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we inspect [cli.train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/cli/train.py), we will find that most of the heavy lifting were done by the function ```train()``` which is itself imported from [src/axolotl/train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/train.py).\n",
+    "\n",
+    "```train()``` takes care of loading the appropriate tokenizer and pre-trained model through ```load_model()``` and ```load_tokenizer()``` from [src/axolotl/utils/models.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/models.py) respectively.\n",
+    "\n",
+    "```load_tokenizer()``` loads in the appropriate tokenizer given the desired model, as well as chat templates.\n",
+    "\n",
+    "```ModelLoader``` class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ```ModelLoader``` will utilize the corresponding \"attention hijacking\" script. For example, if the user specified the base model to be ```NousResearch/Meta-Llama-3.1-8B```, which is of llama type, and set ```flash_attn``` to ```True```, ```ModelLoader``` will load in [llama_attn_hijack_flash.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/monkeypatch/llama_attn_hijack_flash.py). For a list of supported attention hijacking, please refer to the directory [/src/axolotl/monkeypatch/](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch)\n",
+    "\n",
+    "Another important operation encompassed in ```train()``` is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of ```setup_trainer()``` from [/src/axolotl/utils/trainer.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/trainer.py), which in turn relies on modules from [/src/axolotl/core/trainer_builder.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/core/trainer_builder.py).\n",
+    "```trainer_builder.py``` provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning ('dpo', 'ipo', 'kto') )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Monkey patch\n",
+    "\n",
+    "The [Monkey patch directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch) is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
   ]
  }
 ],
 "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "gpuType": "T4",
-   "provenance": []
-  },
  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.1"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 4
+ "nbformat_minor": 2
 }
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -0,0 +1,67 @@
+base_model: deepseek-ai/DeepSeek-V2-Lite
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 2048
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -0,0 +1,86 @@
+base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
+trust_remote_code: true
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rms_norm: true
+liger_glu_activation: true
+liger_fused_linear_cross_entropy: true
+
+chat_template: deepseek_v2
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+adapter: qlora
+lora_r: 256
+lora_alpha: 256
+lora_target_linear: true
+peft_use_rslora: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 8
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -11,8 +11,11 @@ chat_template: gemma
 datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
-    chat_template: gemma
    drop_system_message: true
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
 val_set_size: 0.0
 output_dir: ./outputs/out

--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -0,0 +1,63 @@
+base_model: google/gemma-2-2b
+model_type: AutoModelForSequenceClassification
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+reward_model: true
+chat_template: gemma
+datasets:
+  - path: argilla/distilabel-intel-orca-dpo-pairs
+    type: bradley_terry.chat_template
+val_set_size: 0.0
+output_dir: ./outputs/out
+remove_unused_columns: false
+
+sequence_len: 2048
+sample_packing: false
+eval_sample_packing: false
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch:
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -4,11 +4,15 @@ tokenizer_type: AutoTokenizer
 load_in_4bit: true
 strict: false
 use_tensorboard: true
+chat_template: jamba
 datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
-    chat_template: jamba
    drop_system_message: true
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: jamba-large-fsdp-qlora-ft
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -0,0 +1,63 @@
+base_model: alpindale/Llama-3.2-11B-Vision-Instruct
+processor_type: AutoProcessor
+strict: false
+
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: llama3_2_vision
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+    field_messages: messages
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+adapter: lora
+lora_model_dir:
+
+sequence_len: 8192
+pad_to_sequence_len: false
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+local_rank:
+logging_steps: 1
+flash_attention: true
+eager_attention:
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -0,0 +1,80 @@
+base_model: NousResearch/Meta-Llama-3.1-8B
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_fused_linear_cross_entropy: true
+
+strict: false
+
+chat_template: llama3
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.02
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+  fsdp_backward_prefetch: BACKWARD_PRE
+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+  eos_token: <|eot_id|>
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -1,6 +1,4 @@
-base_model: NousResearch/Meta-Llama-3-8B
-model_type: LlamaForCausalLM
-tokenizer_type: AutoTokenizer
+base_model: NousResearch/Meta-Llama-3.1-8B

 load_in_8bit: false
 load_in_4bit: false
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -11,7 +11,6 @@ rl: dpo
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
-    chat_template: llama3
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -10,7 +10,6 @@ chat_template: llama3
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
-    chat_template: llama3
    field_messages: messages
    message_field_role: role
    message_field_content: content
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -0,0 +1,77 @@
+base_model: meta-llama/Llama-3.2-1B
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+pad_to_sequence_len: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+loss_watchdog_threshold: 5.0
+loss_watchdog_patience: 3
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|end_of_text|>"
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -0,0 +1,93 @@
+#Note that we are switching from the regular chat template to chatml.
+#If you experience problems with the special tokens, training for more epochs can help.
+#After training, merge the model before inference otherwise you might
+#face problems with the special tokens.
+
+base_model: mistralai/Mistral-7B-Instruct-v0.2
+model_type: MistralForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+chat_template: chatml
+rl: dpo
+datasets:
+  - path: olivermolenschot/alpaca_messages_dpo_test
+    type: chat_template.default
+    field_messages: conversation
+    field_chosen: chosen
+    field_rejected: rejected
+    message_field_role: role
+    message_field_content: content
+
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/dpo-qlora
+
+sequence_len: 2048
+sample_packing: false
+pad_to_sequence_len: true
+
+adapter: qlora
+lora_model_dir:
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.2
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+lora_modules_to_save:
+ - embed_tokens
+ - lm_head
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 16
+num_epochs: 6
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0001
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: false
+s2_attention:
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<|im_start|>"
+  eos_token: "<|im_end|>"
--- a/examples/phi/lora-3.5.yaml
+++ b/examples/phi/lora-3.5.yaml
@@ -0,0 +1,75 @@
+base_model: microsoft/Phi-3.5-mini-instruct
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+chat_template: phi_3
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+    field_messages: messages
+    message_field_role: role
+    message_field_content: content
+    roles:
+      user:
+        - user
+      assistant:
+        - assistant
+
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./outputs/lora-out
+
+sequence_len: 4096
+sample_packing: false
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 4
+num_epochs: 2
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bfloat16: true
+bf16: true
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+s2_attention:
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 4
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -0,0 +1,67 @@
+base_model: Qwen/Qwen2.5-0.5B
+
+strict: false
+
+chat_template: qwen_25
+rl: dpo
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_dpo_test
+    type: chat_template.default
+    field_messages: conversation
+    field_chosen: chosen
+    field_rejected: rejected
+    message_field_role: role
+    message_field_content: content
+    roles:
+      system:
+        - system
+      user:
+        - user
+      assistant:
+        - assistant
+
+dataset_prepared_path:
+val_set_size: 0.0
+output_dir: ./outputs/dpo-out
+
+sequence_len: 2048
+sample_packing: false
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
--- a/image/axolotl-badge-web-legacy.png
+++ b/image/axolotl-badge-web-legacy.png
--- a/image/axolotl-badge-web.png
+++ b/image/axolotl-badge-web.png
--- a/image/axolotl_logo_digital_black.svg
+++ b/image/axolotl_logo_digital_black.svg
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
+    <path fill="#141310" d="M435,234.3l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185.1h31.6l47.9,185.1h-24.5ZM417.7,164.9l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
+    <path fill="#141310" d="M568.2,234.3l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
+    <path fill="#141310" d="M658.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM658.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+    <path fill="#141310" d="M860.6,236.3c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM860.6,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+    <path fill="#141310" d="M773.9,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
+    <path fill="#141310" d="M1036.2,234.3V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.8v-24.1h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
+    <path fill="#141310" d="M978.6,234.3c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3v-45.3h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
+    <path fill="#141310" d="M51.5,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v32.8h20.6v-32.8c0-4.7,3.8-8.4,8.4-8.4Z"/>
+    <path fill="#141310" d="M92.8,49h12.2v-20.6h-12.2c-16,0-29,13-29,29v12.2h20.6v-12.2c0-4.7,3.8-8.4,8.4-8.4Z"/>
+    <path fill="#141310" d="M249.3,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v32.8h20.6v-32.8Z"/>
+    <path fill="#141310" d="M187.4,90.2v-20.6h-103.1v20.6h-41.2v20.6h-20.6v41.2c0,11.4,9.2,20.6,20.6,20.6h185.5c11.4,0,20.6-9.2,20.6-20.6v-41.2h-20.6v-20.6h-41.2ZM166.8,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3ZM228.7,141.7c0-5.7-4.6-10.3-10.3-10.3s-10.3,4.6-10.3,10.3v10.3h-20.6v-20.6c0-11.4,9.2-20.6,20.6-20.6s20.6,9.2,20.6,20.6v10.3Z"/>
+    <path fill="#141310" d="M208,57.4c0-16-13-29-29-29h-12.2v20.6h12.2c4.7,0,8.4,3.8,8.4,8.4v12.2h20.6v-12.2Z"/>
+    <rect fill="#141310" x="22.5" y="234.5" width="41.2" height="20.6"/>
+    <rect fill="#141310" x="84.3" y="234.5" width="164.9" height="20.6"/>
+    <rect fill="#141310" x="208" y="193.3" width="41.2" height="20.6"/>
+    <rect fill="#141310" x="22.5" y="193.3" width="164.9" height="20.6"/>
+</svg>
--- a/image/axolotl_logo_digital_white.svg
+++ b/image/axolotl_logo_digital_white.svg
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 1113 283.5">
+    <path fill="#fff" d="M462.9,234.2l-12.1-48.8h-54.4l-12.1,48.8h-24.7l48.2-185h31.6l47.9,185h-24.4ZM445.7,164.8l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
+    <path fill="#fff" d="M596.1,234.2l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.5-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.3,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.1,49.3,71.6h-28.5Z"/>
+    <path fill="#fff" d="M686.4,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM686.4,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+    <path fill="#fff" d="M888.3,236.2c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.6,14.8-41.4,9.8-9.7,23.4-14.7,40.2-14.7s30.4,4.9,40.2,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.4-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM888.3,114.1c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.8v-36.7c0-10.5-2.8-18.5-8.2-23.8-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+    <path fill="#fff" d="M801.7,234c-18,0-32.6-14.6-32.6-32.6V48.8h24.1v152.5c0,4.7,3.8,8.5,8.5,8.5h16.7v24.1h-16.7Z"/>
+    <path fill="#fff" d="M1063.8,234.2V81.4c0-4.7-3.8-8.5-8.5-8.5h-16.7v-24.1h16.7c18,0,32.6,14.6,32.6,32.6v152.8h-24.1Z"/>
+    <path fill="#fff" d="M1006.2,234.2c-18,0-32.6-14.6-32.6-32.6v-85h-20.3v-22.1h20.3v-45.2h24.1v45.2h30.2v22.1h-30.2v85c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
+    <path fill="#fff" d="M160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM277.3,57.4c0-23.8-19.3-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.7,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.7-6.3-14.1-14.1-14.1h-12.2c-6.5,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.3-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.2c0,11,5.2,20.8,13.2,27.2-7.3.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.7,6.3,14.1,14.1,14.1h41.2c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h164.9c7.7,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.8-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.2c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM77.8,255.1h-41.2v-20.6h41.2v20.6ZM36.5,213.9v-20.6h164.9v20.6H36.5ZM263.3,255.1H98.4v-20.6h164.9v20.6ZM263.3,213.9h-41.2v-20.6h41.2v20.6ZM263.3,90.2h-20.6v20.6h20.6v41.2c0,11.4-9.2,20.6-20.6,20.6H57.2c-11.4,0-20.6-9.2-20.6-20.6v-41.2h20.6v-20.6h-20.6v-32.8c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.2v-20.6h-20.6v-12.2c0-16,13-29,29-29h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.1v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v12.2h-20.6v20.6h41.2v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16,0,29,13,29,29v32.8ZM201.4,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM222,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM160.2,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
+</svg>
--- a/image/axolotl_symbol_digital_black.svg
+++ b/image/axolotl_symbol_digital_black.svg
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
+  <defs>
+    <style>
+      .cls-1 {
+        fill: #141310;
+      }
+    </style>
+  </defs>
+  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
+  <g>
+    <g id="Layer_1">
+      <g>
+        <path class="cls-1" d="M46.9,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v36.9h23.2v-36.9c0-5.2,4.2-9.5,9.5-9.5Z"/>
+        <path class="cls-1" d="M93.2,37.4h13.7V14.2h-13.7c-18,0-32.7,14.6-32.7,32.7v13.7h23.2v-13.7c0-5.2,4.2-9.5,9.5-9.5Z"/>
+        <path class="cls-1" d="M269.3,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v36.9h23.2v-36.9Z"/>
+        <path class="cls-1" d="M199.7,83.8v-23.2h-116v23.2h-46.4v23.2H14.2v46.4c0,12.8,10.4,23.2,23.2,23.2h208.7c12.8,0,23.2-10.4,23.2-23.2v-46.4h-23.2v-23.2h-46.4ZM176.5,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6ZM246.1,141.7c0-6.4-5.2-11.6-11.6-11.6s-11.6,5.2-11.6,11.6v11.6h-23.2v-23.2c0-12.8,10.4-23.2,23.2-23.2s23.2,10.4,23.2,23.2v11.6Z"/>
+        <path class="cls-1" d="M222.9,46.9c0-18-14.6-32.7-32.7-32.7h-13.7v23.2h13.7c5.2,0,9.5,4.2,9.5,9.5v13.7h23.2v-13.7Z"/>
+        <rect class="cls-1" x="14.2" y="246.1" width="46.4" height="23.2"/>
+        <rect class="cls-1" x="83.8" y="246.1" width="185.5" height="23.2"/>
+        <rect class="cls-1" x="222.9" y="199.7" width="46.4" height="23.2"/>
+        <rect class="cls-1" x="14.2" y="199.7" width="185.5" height="23.2"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/image/axolotl_symbol_digital_white.svg
+++ b/image/axolotl_symbol_digital_white.svg
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 283.5 283.5">
+  <defs>
+    <style>
+      .cls-1 {
+        fill: #fff;
+      }
+    </style>
+  </defs>
+  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
+  <g>
+    <g id="Layer_1">
+      <path class="cls-1" d="M152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM269.3,57.3c0-23.8-19.4-43.1-43.1-43.1h-12.2c-3.9,0-7.6,1.6-10.2,4.4-5.9-2.9-12.3-4.4-18.9-4.4h-12.2c-7.8,0-14.1,6.3-14.1,14.1v20.6c0,2.4.6,4.6,1.6,6.6h-37c1-2,1.6-4.2,1.6-6.6v-20.6c0-7.8-6.3-14.1-14.1-14.1h-12.2c-6.6,0-13,1.5-18.9,4.4-2.6-2.8-6.3-4.4-10.2-4.4h-12.2c-23.8,0-43.1,19.4-43.1,43.1v32.8c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v41.3c0,11,5.2,20.9,13.2,27.2-7.4.4-13.2,6.6-13.2,14v20.6c0,4.1,1.7,7.7,4.5,10.3-2.8,2.6-4.5,6.2-4.5,10.3v20.6c0,7.8,6.3,14.1,14.1,14.1h41.3c4.1,0,7.7-1.7,10.3-4.5,2.6,2.8,6.2,4.5,10.3,4.5h165.1c7.8,0,14.1-6.3,14.1-14.1v-20.6c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-20.6c0-7.5-5.9-13.6-13.2-14,8-6.4,13.2-16.2,13.2-27.2v-41.3c0-4.1-1.7-7.7-4.5-10.3,2.8-2.6,4.5-6.2,4.5-10.3v-32.8ZM69.5,255.2H28.2v-20.6h41.3v20.6ZM28.2,214v-20.6h165.1v20.6H28.2ZM255.2,255.2H90.1v-20.6h165.1v20.6ZM255.2,214h-41.3v-20.6h41.3v20.6ZM255.2,90.1h-20.6v20.6h20.6v41.3c0,11.4-9.2,20.6-20.6,20.6H48.9c-11.4,0-20.6-9.2-20.6-20.6v-41.3h20.6v-20.6h-20.6v-32.8c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v32.8h41.3v-20.6h-20.6v-12.2c0-16.1,13-29.1,29.1-29.1h12.2v20.6h-12.2c-4.7,0-8.4,3.8-8.4,8.4v12.2h103.2v-12.2c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v12.2h-20.6v20.6h41.3v-32.8c0-4.7-3.8-8.4-8.4-8.4h-12.2v-20.6h12.2c16.1,0,29.1,13,29.1,29.1v32.8ZM193.3,152h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6s-20.6,9.2-20.6,20.6v20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM214,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6ZM152,110.8c-11.4,0-20.6,9.2-20.6,20.6v20.6h20.6v-10.3c0-5.7,4.6-10.3,10.3-10.3s10.3,4.6,10.3,10.3v-10.3c0-11.4-9.2-20.6-20.6-20.6Z"/>
+    </g>
+  </g>
+</svg>
--- a/image/axolotl_wordmark_digital_black.svg
+++ b/image/axolotl_wordmark_digital_black.svg
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
+  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
+  <g>
+    <g id="Layer_1">
+      <g>
+        <path d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
+        <path d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
+        <path d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+        <path d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+        <path d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
+        <path d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
+        <path d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/image/axolotl_wordmark_digital_white.svg
+++ b/image/axolotl_wordmark_digital_white.svg
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg" version="1.1" viewBox="0 0 765.4 212.6">
+  <defs>
+    <style>
+      .cls-1 {
+        fill: #fff;
+      }
+    </style>
+  </defs>
+  <!-- Generator: Adobe Illustrator 28.7.1, SVG Export Plug-In . SVG Version: 1.2.0 Build 142)  -->
+  <g>
+    <g id="Layer_1">
+      <g>
+        <path class="cls-1" d="M121.6,198.1l-12.1-48.8h-54.4l-12.1,48.8h-24.7L66.6,12.9h31.6l47.9,185.1h-24.5ZM104.4,128.6l-13.8-55.6c-2.7-10.7-4.8-19.7-6.3-26.9-.9-4.2-1.5-7.5-2-9.9-.5,2.5-1.2,5.8-2,9.9-1.5,7.1-3.6,16.1-6.3,26.7l-13.8,55.9h44.3Z"/>
+        <path class="cls-1" d="M254.9,198.1l-29.9-45.6c-1.2-1.9-2.4-4.1-3.5-6.5-.8-1.7-1.5-3.3-2.1-4.5-.6,1.3-1.4,2.8-2.3,4.5-1.3,2.4-2.6,4.6-4,6.5l-29.9,45.6h-28.5l49.6-71.9-46.5-67.9h28.5l27.6,43.1c1.2,1.9,2.3,3.9,3.4,6.1.7,1.4,1.4,2.7,1.9,3.8.5-1.1,1.1-2.4,1.8-3.8,1.1-2.2,2.2-4.2,3.4-6.1l27.6-43.1h28.5l-46.5,68.2,49.3,71.7h-28.5Z"/>
+        <path class="cls-1" d="M345.2,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM345.2,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+        <path class="cls-1" d="M547.3,200.1c-16.7,0-30.2-5-40.1-14.8-9.9-9.8-14.9-23.7-14.9-41.3v-31.7c0-17.7,5-31.7,14.8-41.4,9.8-9.7,23.4-14.7,40.3-14.7s30.4,4.9,40.3,14.7c9.8,9.7,14.8,23.7,14.8,41.4v31.7c0,17.6-5,31.5-14.9,41.3-9.9,9.8-23.4,14.8-40.1,14.8ZM547.3,77.8c-9.5,0-17.1,2.7-22.6,8.1-5.5,5.4-8.3,13.4-8.3,23.8v36.7c0,10.5,2.8,18.5,8.3,23.8,5.5,5.4,13.1,8.1,22.6,8.1s17.3-2.7,22.7-8.1c5.4-5.4,8.2-13.4,8.2-23.9v-36.7c0-10.5-2.8-18.5-8.2-23.9-5.4-5.4-13.1-8.1-22.7-8.1Z"/>
+        <path class="cls-1" d="M460.6,197.8c-18,0-32.6-14.6-32.6-32.6V12.5h24.1v152.6c0,4.7,3.8,8.5,8.5,8.5h16.8v24.1h-16.8Z"/>
+        <path class="cls-1" d="M722.8,198.1V45.2c0-4.7-3.8-8.5-8.5-8.5h-16.8V12.5h16.8c18,0,32.6,14.6,32.6,32.6v152.9h-24.1Z"/>
+        <path class="cls-1" d="M665.2,198.1c-18,0-32.6-14.6-32.6-32.6v-85.1h-20.3v-22.1h20.3V12.9h24.1v45.3h30.2v22.1h-30.2v85.1c0,4.7,3.8,8.5,8.5,8.5h21.7v24.1h-21.7Z"/>
+      </g>
+    </g>
+  </g>
+</svg>
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -2,3 +2,4 @@ pre-commit
 black
 mypy
 types-requests
+tbparse
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,2 +1,3 @@
 pytest
 pytest-xdist
+pytest-retry
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,22 +1,22 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.12.0
-transformers==4.44.0
-tokenizers>=0.19.1
-bitsandbytes==0.43.3
-accelerate==0.33.0
-datasets==2.20.0
-deepspeed==0.14.4
+peft==0.13.2
+transformers==4.46.3
+tokenizers>=0.20.1
+bitsandbytes==0.44.1
+accelerate==1.1.0
+datasets==3.1.0
+deepspeed==0.15.4
 pydantic==2.6.3
 addict
 fire
 PyYAML>=6.0
 requests
-flash-attn==2.6.3
+flash-attn==2.7.0.post2
 sentencepiece
 wandb
 einops
-xformers==0.0.27
+xformers>=0.0.23.post1
 optimum==1.16.2
 hf_transfer
 colorama
@@ -28,11 +28,12 @@ scipy
 scikit-learn==1.4.2
 pynvml
 art
-fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
 gradio==3.50.2
 tensorboard
 python-dotenv==1.0.1
-autoawq>=0.2.5
+autoawq==0.2.7.post2
+triton>=2.3.0
+liger-kernel==0.4.2

 mamba-ssm==1.2.0.post1

@@ -41,6 +42,15 @@ s3fs>=2024.5.0
 gcsfs>=2024.5.0
 # adlfs

-trl==0.9.6
+trl==0.12.0
 zstandard==0.22.0
 fastcore
+
+# lm eval harness
+lm_eval==0.4.4
+langdetect==1.0.9
+immutabledict==4.2.0
+antlr4-python3-runtime==4.13.2
+
+torchao==0.5.0
+schedulefree==1.3.0
--- a/requirements_env.txt
+++ b/requirements_env.txt
@@ -0,0 +1,315 @@
+accelerate==0.34.1
+addict==2.4.0
+aiofiles==23.2.1
+aiohttp==3.9.0
+aiosignal==1.3.1
+aiostream==0.5.2
+alembic==1.13.1
+annotated-types==0.6.0
+annoy==1.17.3
+ansible==6.7.0
+ansible-core==2.13.13
+ansible-vault==2.1.0
+anyio==3.7.1
+appdirs==1.4.4
+art==6.0
+asgiref==3.7.2
+async-timeout==4.0.2
+attrdict==2.0.1
+attrs==22.2.0
+awscli==1.32.75
+-e git+ssh://git@github.com/OpenAccess-AI-Collective/axolotl.git@6e354682e3c1735d3f7fb9e362280c38e922260f#egg=axolotl
+backoff==2.2.1
+base58==2.1.1
+beartype==0.17.2
+bitnet==0.2.1
+bitsandbytes==0.42.0
+bittensor==6.7.0
+black==23.7.0
+blinker==1.7.0
+boto3==1.34.75
+botocore==1.34.75
+cachetools==5.3.3
+cachy==0.1.1
+certifi==2023.7.22
+cffi==1.16.0
+cfgv==3.3.1
+chai-guanaco==1.2.4
+charset-normalizer==3.2.0
+cleo==0.6.8
+click==8.1.7
+cloudpickle==2.0.0
+cohere==4.11.2
+colorama==0.4.4
+coloredlogs==15.0.1
+CoLT5-attention==0.10.20
+contextlib2==21.6.0
+contourpy==1.2.0
+cryptography==41.0.3
+cycler==0.12.1
+cytoolz==0.12.3
+databricks-cli==0.18.0
+dataclasses-json==0.5.7
+datasets==2.11.0
+ddt==1.6.0
+decorator==5.1.1
+deepspeed==0.15.0
+# Editable Git install with no remote (dialogpt==0.1)
+-e /Users/wing/Projects/ml/dialogpt/src
+dill==0.3.6
+distlib==0.3.6
+docker==7.0.0
+docker-pycreds==0.4.0
+docstring-parser==0.15
+docutils==0.16
+ecdsa==0.18.0
+einops==0.7.0
+einops-exts==0.0.4
+einx==0.1.3
+entrypoints==0.4
+eth-hash==0.6.0
+eth-keys==0.5.0
+eth-typing==4.0.0
+eth-utils==2.3.1
+evaluate==0.4.0
+exceptiongroup==1.1.1
+fastapi==0.109.2
+fastcore==1.5.29
+ffmpy==0.4.0
+filelock==3.12.2
+-e git+https://github.com/NousResearch/finetuning-subnet.git@24e9407d6b4430a7ca39d344692f89ce5a97d27e#egg=finetuning_subnet
+fire==0.5.0
+first==2.0.2
+flake8==7.0.0
+Flask==3.0.1
+fonttools==4.47.2
+frozendict==2.4.1
+frozenlist==1.3.3
+fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
+fsspec==2023.6.0
+fuzzywuzzy==0.18.0
+gitdb==4.0.10
+GitPython==3.1.31
+google-pasta==0.2.0
+gradio==4.42.0
+gradio_client==1.3.0
+greenlet==2.0.2
+grpclib==0.4.7
+gunicorn==21.2.0
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub==0.23.4
+humanfriendly==10.0
+hyperframe==6.0.1
+identify==2.5.24
+idna==3.4
+immutables==0.20
+importlib-metadata==6.7.0
+importlib-resources==6.1.1
+inflection==0.5.1
+iniconfig==2.0.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.3.2
+jsonlines==3.1.0
+jsonschema==2.6.0
+kiwisolver==1.4.5
+langchain==0.0.144
+Levenshtein==0.24.0
+libcst==1.1.0
+liger-kernel==0.0.0
+lion-pytorch==0.1.2
+llama-cpp-python==0.1.36
+llvmlite==0.40.1
+local-attention==1.9.0
+loguru==0.7.0
+Mako==1.3.2
+Markdown==3.5.2
+markdown-it-py==3.0.0
+markdown2==2.4.10
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+matplotlib==3.8.2
+mccabe==0.7.0
+mdurl==0.1.2
+MEGABYTE-pytorch==0.0.7
+-e git+https://github.com/cg123/mergekit.git@53c5f414774a0558b8d84858fb6374bc93a8f1c1#egg=mergekit
+mlflow==2.10.0
+modal==0.62.77
+more-itertools==10.2.0
+mpmath==1.2.1
+msgpack==1.0.7
+msgpack-numpy-opentensor==0.5.0
+multidict==6.0.4
+multiprocess==0.70.14
+munch==2.5.0
+mypy==1.3.0
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+netaddr==0.10.1
+networkx==3.0rc1
+nh3==0.2.14
+nodeenv==1.8.0
+nomic==2.0.2
+numba==0.57.1
+numexpr==2.8.4
+numpy==1.24.4
+oauthlib==3.2.2
+openai==0.27.4
+openapi==1.1.0
+openapi-schema-pydantic==1.2.4
+optimum==1.8.6
+orjson==3.10.7
+packaging==23.1
+pandas==2.0.0
+parameterized==0.9.0
+password-strength==0.0.3.post2
+pastel==0.1.1
+pathos==0.3.0
+pathspec==0.11.1
+pathtools==0.1.2
+peft==0.11.1
+pendulum==3.0.0
+Pillow==9.5.0
+pip-tools==1.11.0
+platformdirs==3.2.0
+pluggy==1.4.0
+poetry==0.7.1
+pox==0.3.2
+ppft==1.7.6.6
+pre-commit==3.3.2
+prettytable==3.10.0
+prompt-toolkit==3.0.39
+protobuf==3.20.2
+protobuf3-to-dict==0.1.5
+psutil==5.9.5
+psycopg==3.1.18
+PuLP==2.8.0
+py==1.11.0
+py-bip39-bindings==0.1.11
+py-cpuinfo==9.0.0
+py-ed25519-zebra-bindings==1.0.1
+py-sr25519-bindings==0.2.0
+pyarrow==11.0.0
+pyasn1==0.6.0
+pycodestyle==2.11.1
+pycparser==2.21
+pycryptodome==3.20.0
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydub==0.25.1
+pyfiglet==0.8.post1
+pyflakes==3.2.0
+Pygments==2.15.1
+PyJWT==2.8.0
+pylev==1.4.0
+PyNaCl==1.5.0
+pynvml==11.5.0
+pyparsing==2.4.7
+pyrsistent==0.14.11
+pytest==8.0.2
+pytest-asyncio==0.23.4
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-Levenshtein==0.24.0
+python-multipart==0.0.9
+pytz==2023.3
+PyYAML==6.0.1
+querystring-parser==1.2.4
+rapidfuzz==3.6.1
+regex==2023.6.3
+requests==2.31.0
+requests-toolbelt==0.8.0
+resolvelib==0.8.1
+responses==0.18.0
+retry==0.9.2
+rich==13.7.0
+rsa==4.7.2
+ruff==0.6.3
+s3transfer==0.10.1
+safetensors==0.4.5
+sagemaker==2.148.0
+scalecodec==1.2.7
+schedulefree==1.2.1
+schema==0.7.5
+scikit-learn==1.4.0
+scipy==1.9.3
+seaborn==0.13.2
+semantic-version==2.10.0
+sentencepiece==0.2.0
+sentry-sdk==1.19.1
+setproctitle==1.3.2
+shellingham==1.5.4
+shortuuid==1.0.11
+shtab==1.6.5
+sigtools==4.0.1
+six==1.16.0
+skypilot==0.4.1
+smdebug-rulesconfig==1.0.1
+smmap==5.0.0
+sniffio==1.3.0
+SQLAlchemy==1.4.47
+sqlparse==0.4.4
+starlette==0.36.3
+substrate-interface==1.5.2
+svgwrite==1.4.3
+sympy==1.11.1
+synchronicity==0.6.7
+tabulate==0.9.0
+tblib==1.7.0
+tenacity==8.2.2
+tensor-parallel==2.0.0
+termcolor==2.2.0
+text2art==0.2.0
+threadpoolctl==3.2.0
+tiktoken==0.6.0
+time-machine==2.14.1
+timm==0.9.16
+tokenizers==0.19.1
+tokenmonster==1.1.12
+toml==0.9.6
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.0
+torchdata==0.6.1
+torchdiffeq==0.2.3
+TorchFix==0.4.0
+torchtext==0.15.2
+torchvision==0.17.0
+tqdm==4.66.2
+transformers==4.44.2
+trl==0.9.6
+typer==0.12.5
+types-certifi==2021.10.8.3
+types-requests==2.31.0.20240125
+types-setuptools==69.0.0.20240125
+types-toml==0.10.8.7
+typing==3.7.4.3
+typing-inspect==0.8.0
+typing_extensions==4.9.0
+tyro==0.5.18
+tzdata==2023.3
+unique-names-generator==1.0.2
+urllib3==2.2.2
+uvicorn==0.22.0
+vector_quantize_pytorch==1.14.1
+virtualenv==20.23.0
+voyager==2.0.2
+wandb==0.16.2
+watchfiles==0.21.0
+wavedrom==2.0.3.post3
+wcwidth==0.2.6
+websocket-client==1.7.0
+websockets==12.0
+Werkzeug==3.0.1
+wonderwords==2.2.0
+xxhash==3.2.0
+yarl==1.8.2
+zetascale==2.2.7
+zipp==3.15.0
--- a/scripts/chat_datasets.py
+++ b/scripts/chat_datasets.py
@@ -0,0 +1,60 @@
+"""
+helper script to parse chat datasets into a usable yaml
+"""
+import click
+import yaml
+from datasets import load_dataset
+
+
+@click.command()
+@click.argument("dataset", type=str)
+@click.option("--split", type=str, default="train")
+def parse_dataset(dataset=None, split="train"):
+    ds_cfg = {}
+    ds_cfg["path"] = dataset
+    ds_cfg["split"] = split
+    ds_cfg["type"] = "chat_template"
+    ds_cfg["chat_template"] = "<<<Replace based on your model>>>"
+
+    dataset = load_dataset(dataset, split=split)
+    features = dataset.features
+    feature_keys = features.keys()
+    field_messages = None
+    for key in ["conversation", "conversations", "messages"]:
+        if key in feature_keys:
+            field_messages = key
+            break
+    if not field_messages:
+        raise ValueError(
+            f'No conversation field found in dataset: {", ".join(feature_keys)}'
+        )
+    ds_cfg["field_messages"] = field_messages
+
+    message_fields = features["conversations"][0].keys()
+    message_field_role = None
+    for key in ["from", "role"]:
+        if key in message_fields:
+            message_field_role = key
+            break
+    if not message_field_role:
+        raise ValueError(
+            f'No role field found in messages: {", ".join(message_fields)}'
+        )
+    ds_cfg["message_field_role"] = message_field_role
+
+    message_field_content = None
+    for key in ["content", "text", "value"]:
+        if key in message_fields:
+            message_field_content = key
+            break
+    if not message_field_content:
+        raise ValueError(
+            f'No content field found in messages: {", ".join(message_fields)}'
+        )
+    ds_cfg["message_field_content"] = message_field_content
+
+    print(yaml.dump({"datasets": [ds_cfg]}))
+
+
+if __name__ == "__main__":
+    parse_dataset()
--- a/scripts/cloud-entrypoint.sh
+++ b/scripts/cloud-entrypoint.sh
@@ -2,7 +2,7 @@

 # Export specific ENV variables to /etc/rp_environment
 echo "Exporting environment variables..."
-printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
+printenv | grep -E '^HF_|^BNB_|^CUDA_|^NCCL_|^NV|^RUNPOD_|^PATH=|^_=' | sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' | grep -v 'printenv' >> /etc/rp_environment
 echo 'source /etc/rp_environment' >> ~/.bashrc

 add_keys_to_authorized() {
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -0,0 +1,33 @@
+# noqa
+# pylint: skip-file
+try:
+    import torch
+except ImportError:
+    raise ImportError("Install torch via `pip install torch`")
+from packaging.version import Version as V
+
+v = V(torch.__version__)
+cuda = str(torch.version.cuda)
+is_ampere = torch.cuda.get_device_capability()[0] >= 8
+if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
+    raise RuntimeError(f"CUDA = {cuda} not supported!")
+if v <= V("2.1.0"):
+    raise RuntimeError(f"Torch = {v} too old!")
+elif v <= V("2.1.1"):
+    x = "cu{}{}-torch211"
+elif v <= V("2.1.2"):
+    x = "cu{}{}-torch212"
+elif v < V("2.3.0"):
+    x = "cu{}{}-torch220"
+elif v < V("2.4.0"):
+    x = "cu{}{}-torch230"
+elif v < V("2.5.0"):
+    x = "cu{}{}-torch240"
+elif v < V("2.6.0"):
+    x = "cu{}{}-torch250"
+else:
+    raise RuntimeError(f"Torch = {v} too new!")
+x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
+print(
+    f'pip install unsloth-zoo && pip install --no-deps "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"'
+)
--- a/setup.py
+++ b/setup.py
@@ -30,13 +30,19 @@ def parse_requirements():

    try:
        xformers_version = [req for req in _install_requires if "xformers" in req][0]
+        torchao_version = [req for req in _install_requires if "torchao" in req][0]
+        autoawq_version = [req for req in _install_requires if "autoawq" in req][0]
+
        if "Darwin" in platform.system():
            # don't install xformers on MacOS
            _install_requires.pop(_install_requires.index(xformers_version))
        else:
            # detect the version of torch already installed
            # and set it so dependencies don't clobber the torch version
-            torch_version = version("torch")
+            try:
+                torch_version = version("torch")
+            except PackageNotFoundError:
+                torch_version = "2.5.1"
            _install_requires.append(f"torch=={torch_version}")

            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
@@ -49,20 +55,39 @@ def parse_requirements():
            else:
                raise ValueError("Invalid version format")

-            if (major, minor) >= (2, 3):
+            if (major, minor) >= (2, 5):
+                _install_requires.pop(_install_requires.index(xformers_version))
+                if patch == 0:
+                    _install_requires.append("xformers==0.0.28.post2")
+                else:
+                    _install_requires.append("xformers==0.0.28.post3")
+                _install_requires.pop(_install_requires.index(autoawq_version))
+            elif (major, minor) >= (2, 4):
+                if patch == 0:
+                    _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers>=0.0.27")
+                else:
+                    _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers==0.0.28.post1")
+            elif (major, minor) >= (2, 3):
+                _install_requires.pop(_install_requires.index(torchao_version))
                if patch == 0:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers>=0.0.26.post1")
+                else:
+                    _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers>=0.0.27")
            elif (major, minor) >= (2, 2):
+                _install_requires.pop(_install_requires.index(torchao_version))
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append("xformers>=0.0.25.post1")
            else:
+                _install_requires.pop(_install_requires.index(torchao_version))
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append("xformers>=0.0.23.post1")

    except PackageNotFoundError:
        pass
-
    return _install_requires, _dependency_links


@@ -71,26 +96,24 @@ install_requires, dependency_links = parse_requirements()

 setup(
    name="axolotl",
-    version="0.4.1",
+    version="0.5.2",
    description="LLM Trainer",
    long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
    package_dir={"": "src"},
-    packages=find_packages(),
+    packages=find_packages("src"),
    install_requires=install_requires,
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.6.3",
-        ],
-        "fused-dense-lib": [
-            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.6.2#subdirectory=csrc/fused_dense_lib",
+            "flash-attn==2.7.0.post2",
        ],
        "deepspeed": [
-            "deepspeed==0.14.4",
+            "deepspeed==0.15.4",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
            "mamba-ssm==1.2.0.post1",
+            "causal_conv1d",
        ],
        "auto-gptq": [
            "auto-gptq==0.5.1",
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -27,8 +27,14 @@ from transformers.utils import is_torch_bf16_gpu_available
 from transformers.utils.import_utils import _is_package_available

 from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
+from axolotl.integrations.base import PluginManager
 from axolotl.logging_config import configure_logging
 from axolotl.train import TrainDatasetMeta
+from axolotl.utils.chat_templates import (
+    get_chat_template,
+    get_chat_template_from_config,
+)
+from axolotl.utils.comet_ import setup_comet_env_vars
 from axolotl.utils.config import (
    normalize_cfg_datasets,
    normalize_config,
@@ -38,7 +44,7 @@ from axolotl.utils.data import load_prepare_dpo_datasets, prepare_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.mlflow_ import setup_mlflow_env_vars
-from axolotl.utils.models import load_tokenizer
+from axolotl.utils.models import load_processor, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars
@@ -52,8 +58,22 @@ LOG = logging.getLogger("axolotl.scripts")

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

+AXOLOTL_LOGO = """
+     #@@ #@@      @@# @@#
+    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
+    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
+      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
+    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
+    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
+     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
+                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
+    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
+                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
+    @@@@  @@@@@@@@@@@@@@@@
+"""

-def print_axolotl_text_art(suffix=None):
+
+def print_legacy_axolotl_text_art(suffix=None):
    font = "nancyj"
    ascii_text = "  axolotl"
    if suffix:
@@ -66,6 +86,13 @@ def print_axolotl_text_art(suffix=None):
    print_dep_versions()


+def print_axolotl_text_art(
+    **kwargs,  # pylint: disable=unused-argument
+):
+    if is_main_process():
+        print(AXOLOTL_LOGO)
+
+
 def print_dep_versions():
    packages = ["accelerate", "peft", "transformers", "trl", "torch", "bitsandbytes"]
    max_len = max(len(pkg) for pkg in packages)
@@ -166,18 +193,19 @@ def do_inference(
 ):
    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
-    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-
-    for token, symbol in default_tokens.items():
-        # If the token isn't already specified in the config, add it
-        if not (cfg.special_tokens and token in cfg.special_tokens):
-            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
+    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
+    elif cfg.chat_template:
+        chat_template_str = get_chat_template(cfg.chat_template)
+    elif cfg.datasets[0].type == "chat_template":
+        chat_template_str = get_chat_template_from_config(
+            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
+        )

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

@@ -187,13 +215,31 @@ def do_inference(
        instruction = get_multi_line_input()
        if not instruction:
            return
+
        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()
-        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
+
+        if chat_template_str:
+            batch = tokenizer.apply_chat_template(
+                [
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    }
+                ],
+                return_tensors="pt",
+                add_special_tokens=True,
+                add_generation_prompt=True,
+                chat_template=chat_template_str,
+                tokenize=True,
+                return_dict=True,
+            )
+        else:
+            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        print("=" * 40)
        model.eval()
@@ -233,18 +279,15 @@ def do_inference_gradio(

    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
-    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
-
-    for token, symbol in default_tokens.items():
-        # If the token isn't already specified in the config, add it
-        if not (cfg.special_tokens and token in cfg.special_tokens):
-            tokenizer.add_special_tokens({token: symbol})

    prompter_module = None
+    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
+    elif cfg.chat_template:
+        chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

@@ -258,7 +301,24 @@ def do_inference_gradio(
            )
        else:
            prompt = instruction.strip()
-        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
+
+        if chat_template_str:
+            batch = tokenizer.apply_chat_template(
+                [
+                    {
+                        "role": "user",
+                        "content": prompt,
+                    }
+                ],
+                return_tensors="pt",
+                add_special_tokens=True,
+                add_generation_prompt=True,
+                chat_template=chat_template_str,
+                tokenize=True,
+                return_dict=True,
+            )
+        else:
+            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        model.eval()
        with torch.no_grad():
@@ -281,6 +341,7 @@ def do_inference_gradio(
            streamer = TextIteratorStreamer(tokenizer)
            generation_kwargs = {
                "inputs": batch["input_ids"].to(cfg.device),
+                "attention_mask": batch["attention_mask"].to(cfg.device),
                "generation_config": generation_config,
                "streamer": streamer,
            }
@@ -365,6 +426,11 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):

    cfg.axolotl_config_path = config

+    if cfg.get("plugins"):
+        plugin_manager = PluginManager.get_instance()
+        for plugin_name in cfg["plugins"]:
+            plugin_manager.register(plugin_name)
+
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
@@ -392,6 +458,8 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):

    setup_mlflow_env_vars(cfg)

+    setup_comet_env_vars(cfg)
+
    return cfg


@@ -401,12 +469,20 @@ def load_datasets(
    cli_args: TrainerCliArgs,
 ) -> TrainDatasetMeta:
    tokenizer = load_tokenizer(cfg)
+    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None

    train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
-        cfg, tokenizer
+        cfg,
+        tokenizer,
+        processor=processor,
    )

-    if cli_args.debug or cfg.debug:
+    if (
+        cli_args.debug
+        or cfg.debug
+        or cli_args.debug_text_only
+        or int(cli_args.debug_num_examples) > 0
+    ):
        LOG.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -23,10 +23,7 @@ from axolotl.cli import (
 )
 from axolotl.common.cli import PreprocessCliArgs
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
-from axolotl.prompt_strategies.sharegpt import (
-    register_chatml_template,
-    register_llama3_template,
-)
+from axolotl.utils.trainer import disable_datasets_caching

 LOG = logging.getLogger("axolotl.cli.preprocess")

@@ -43,23 +40,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        return_remaining_strings=True
    )

-    if parsed_cfg.chat_template == "chatml":
-        if parsed_cfg.default_system_message:
-            LOG.info(
-                f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
-            )
-            register_chatml_template(parsed_cfg.default_system_message)
-        else:
-            register_chatml_template()
-    elif parsed_cfg.chat_template == "llama3":
-        if parsed_cfg.default_system_message:
-            LOG.info(
-                f"LLaMA-3 set. Adding default system message: {parsed_cfg.default_system_message}"
-            )
-            register_llama3_template(parsed_cfg.default_system_message)
-        else:
-            register_llama3_template()
-
    if not parsed_cfg.dataset_prepared_path:
        msg = (
            Fore.RED
@@ -70,10 +50,11 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        LOG.warning(msg)
        parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

-    if parsed_cfg.rl:  # and parsed_cfg.rl != "orpo":
-        load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    else:
-        load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+    with disable_datasets_caching():
+        if parsed_cfg.rl:  # and parsed_cfg.rl != "orpo":
+            load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+        else:
+            load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)

    if parsed_cli_args.download:
        model_name = parsed_cfg.base_model
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -3,13 +3,11 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import fire
 from dotenv import load_dotenv
 from transformers.hf_argparser import HfArgumentParser
-from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils import PreTrainedTokenizer

 from axolotl.cli import (
    check_accelerate_default_config,
@@ -20,10 +18,7 @@ from axolotl.cli import (
    print_axolotl_text_art,
 )
 from axolotl.common.cli import TrainerCliArgs
-from axolotl.prompt_strategies.sharegpt import (
-    register_chatml_template,
-    register_llama3_template,
-)
+from axolotl.integrations.base import PluginManager
 from axolotl.train import train

 LOG = logging.getLogger("axolotl.cli.train")
@@ -39,32 +34,23 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    return do_train(parsed_cfg, parsed_cli_args)


-def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+def do_train(cfg, cli_args) -> None:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
-    if cfg.chat_template == "chatml" and cfg.default_system_message:
-        LOG.info(
-            f"ChatML set. Adding default system message: {cfg.default_system_message}"
-        )
-        register_chatml_template(cfg.default_system_message)
-    else:
-        register_chatml_template()
-
-    if cfg.chat_template == "llama3" and cfg.default_system_message:
-        LOG.info(
-            f"LLaMA-3 set. Adding default system message: {cfg.default_system_message}"
-        )
-        register_llama3_template(cfg.default_system_message)
-    else:
-        register_llama3_template()

    if cfg.rl:  # and cfg.rl != "orpo":
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-    return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+    model, tokenizer = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+    plugin_manager = PluginManager.get_instance()
+
+    del model
+    del tokenizer
+
+    plugin_manager.post_train_unload(cfg)


 if __name__ == "__main__":
--- a/src/axolotl/common/cli.py
+++ b/src/axolotl/common/cli.py
@@ -23,7 +23,7 @@ class TrainerCliArgs:

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
-    debug_num_examples: int = field(default=5)
+    debug_num_examples: int = field(default=0)
    inference: bool = field(default=False)
    merge_lora: bool = field(default=False)
    prompter: Optional[str] = field(default=None)
--- a/src/axolotl/core/chat/init.py
+++ b/src/axolotl/core/chat/init.py
--- a/src/axolotl/core/chat/format/init.py
+++ b/src/axolotl/core/chat/format/init.py
--- a/src/axolotl/core/chat/format/chatml.py
+++ b/src/axolotl/core/chat/format/chatml.py
@@ -0,0 +1,34 @@
+"""
+ChatML transformation functions for MessageContents
+"""
+from typing import Optional
+
+from ..messages import MessageContents, Messages
+from .shared import wrap_tools
+
+
+def format_message(
+    message: Messages,
+    message_index: Optional[int] = None,  # pylint: disable=unused-argument
+) -> Messages:
+    if message.is_chat_formatted:
+        return message
+
+    # prepend the role prefix within a MessageContents to message.content
+    message.content.insert(
+        0,
+        MessageContents(
+            type="text",
+            value=f"<|im_start|>{message.role}\n",
+            weight=0,
+        ),
+    )
+    message.content.append(
+        MessageContents(type="text", value="<|im_end|>", weight=message.weight)
+    )
+    message.content.append(MessageContents(type="text", value="\n", weight=0))
+
+    message = wrap_tools(message)
+
+    message.is_chat_formatted = True
+    return message
--- a/src/axolotl/core/chat/format/llama3x.py
+++ b/src/axolotl/core/chat/format/llama3x.py
@@ -0,0 +1,45 @@
+"""
+Llama 3.x chat formatting functions for MessageContents
+"""
+from typing import Optional
+
+from ..messages import MessageContents, Messages
+from .shared import wrap_tools
+
+
+def format_message(message: Messages, message_index: Optional[int] = None) -> Messages:
+    if message.is_chat_formatted:
+        return message
+
+    message_role = message.role
+    if message.role == "tool":
+        message_role = "ipython"
+
+    # prepend the role prefix within a MessageContents to message.content
+    message.content.insert(
+        0,
+        MessageContents(
+            type="text",
+            value=f"<|start_header_id|>{message_role}<|end_header_id|>\n\n",
+            weight=0,
+        ),
+    )
+
+    message.content.append(
+        MessageContents(type="text", value="<|eot_id|>", weight=message.weight)
+    )
+
+    message = wrap_tools(message)
+
+    if message_index == 0:
+        message.content.insert(
+            0,
+            MessageContents(
+                type="text",
+                value="<|begin_of_text|>",
+                weight=0,
+            ),
+        )
+
+    message.is_chat_formatted = True
+    return message
--- a/src/axolotl/core/chat/format/shared.py
+++ b/src/axolotl/core/chat/format/shared.py
@@ -0,0 +1,47 @@
+"""
+shared functions for format transforms
+"""
+from axolotl.core.chat.messages import MessageContents, Messages
+
+
+def wrap_tools(message: Messages):
+    # loop over message.content by index to find tool calls, we need to wrap each with tags,
+    # so be wary of indexing issues when changing the list while iterating.
+    # iterate over the range in reverse order to avoid index shifting
+    for i in range(len(message.content) - 1, -1, -1):
+        if message.content[i].type == "tool_call":
+            # append a </tool_call> MessageContents text tag after
+            message.content.insert(
+                i + 1,
+                MessageContents(
+                    type="text", value="</tool_call>\n", weight=message.weight
+                ),
+            )
+            # make sure the actual tool call content ends with a newline
+            message.content[i].has_newline = True
+            # prepend a <tool_call> MessageContents text tag before
+            message.content.insert(
+                i,
+                MessageContents(
+                    type="text", value="<tool_call>\n", weight=message.weight
+                ),
+            )
+        elif message.content[i].type == "tool_response":
+            # append a </tool_call> MessageContents text tag after
+            message.content.insert(
+                i + 1,
+                MessageContents(
+                    type="text", value="</tool_response>\n", weight=message.weight
+                ),
+            )
+            # make sure the actual tool response content ends with a newline
+            message.content[i].has_newline = True
+            # prepend a <tool_call> MessageContents text tag before
+            message.content.insert(
+                i,
+                MessageContents(
+                    type="text", value="<tool_response>\n", weight=message.weight
+                ),
+            )
+
+    return message
--- a/src/axolotl/core/chat/messages.py
+++ b/src/axolotl/core/chat/messages.py
@@ -0,0 +1,230 @@
+"""
+internal message representations of chat messages
+"""
+import json
+from enum import Enum
+from typing import Any, Callable, List, Optional, Union
+
+from pydantic import BaseModel
+from transformers import PreTrainedTokenizer
+
+
+class MessageRoles(str, Enum):
+    """
+    Message roles for the system, user, assistant, and tools
+    """
+
+    system = "system"  # pylint: disable=invalid-name
+    user = "user"  # pylint: disable=invalid-name
+    assistant = "assistant"  # pylint: disable=invalid-name
+    tool = "tool"  # pylint: disable=invalid-name
+    ipython = (  # pylint: disable=invalid-name
+        # for responses from builtin tools
+        "ipython"
+    )
+
+
+class MessageContentTypes(str, Enum):
+    """
+    Message content types for text, image, audio, tool calls, and tool responses
+    """
+
+    special_token = "special_token"  # pylint: disable=invalid-name  # nosec B105
+    text = "text"  # pylint: disable=invalid-name
+    image = "image"  # pylint: disable=invalid-name
+    audio = "audio"  # pylint: disable=invalid-name
+    tool_call = "tool_call"  # pylint: disable=invalid-name  # to differentiate regular responses from tool calls from the assistant
+    tool_response = "tool_response"  # pylint: disable=invalid-name
+
+
+class SpecialToken(str, Enum):
+    """
+    Special tokens for beginning of string and end of string
+    """
+
+    bos_token = "bos_token"  # pylint: disable=invalid-name  # nosec B105
+    eos_token = "eos_token"  # pylint: disable=invalid-name  # nosec B105
+
+
+class ToolCallFunction(BaseModel):
+    """
+    Tool call function with name and arguments
+    """
+
+    name: str
+    arguments: dict[str, str]
+
+
+class Tool(BaseModel):
+    """
+    Tool with description, function, and parameters
+    """
+
+    description: str
+    function: ToolCallFunction
+    parameters: dict[str, str]  # .properties
+
+
+class ToolCallContents(BaseModel):
+    """
+    Tool call contents with name, arguments, and optional id
+    """
+
+    name: str
+    arguments: dict[str, Union[str, int]]
+    id: Optional[str] = None  # pylint: disable=invalid-name
+
+    def __str__(self) -> str:
+        data = {"name": self.name, "arguments": self.arguments}
+        if self.id is not None:
+            data["id"] = self.id
+        return json.dumps(data)
+
+
+class ToolResponseContents(BaseModel):
+    """
+    Tool response contents with name, content, and optional id
+    """
+
+    name: str
+    content: Union[str, dict[str, Union[str, int, float]]]
+    id: Optional[str] = None  # pylint: disable=invalid-name
+
+    def __str__(self) -> str:
+        data = {"name": self.name, "content": self.content}
+        if self.id is not None:
+            data["id"] = self.id
+        return json.dumps(data)
+
+
+class MessageContents(BaseModel):
+    """
+    Message contents with type, value, metadata, weight, newline, and end of contents
+    """
+
+    type: Union[str, MessageContentTypes]
+    value: Union[str, ToolCallContents, ToolResponseContents, SpecialToken]
+    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
+    weight: Optional[Union[int, float]] = None
+    has_newline: bool = False
+    eoc: bool = False  # end of contents
+
+    def __str__(self) -> str:
+        str_val = str(self.value)
+        if self.has_newline and not str_val.endswith("\n"):
+            str_val += "\n"
+        return str_val
+
+
+class Messages(BaseModel):
+    """
+    Messages with role, content, metadata, weight, and chat formatting
+    """
+
+    role: Union[MessageRoles, str]  # allows for arbitrary roles
+    content: List["MessageContents"]
+    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
+    weight: Optional[Union[int, float]] = None
+    is_chat_formatted: bool = False
+
+    def __str__(self) -> str:
+        return "".join(str(c) for c in self.content)
+
+    def tokenized(
+        self, tokenizer: PreTrainedTokenizer, ignore_index=-100
+    ) -> dict[str, List[int]]:
+        # iterate over the contents, tokenizing the concatenated string values up to the current MessageContents
+        # returns a dictionary mapping w input_ids, attention_mask, and labels
+        input_ids: List[int] = []
+        labels: List[int] = []
+        pending_input_ids: List[int] = []
+        pending_weight = self.weight
+        running_content = ""
+        for _, msg_content in enumerate(self.content):
+            # TODO also handle non-text content types
+            if msg_content.type in [
+                MessageContentTypes.text.value,
+                MessageContentTypes.tool_call.value,
+                MessageContentTypes.tool_response.value,
+            ]:
+                running_content += str(msg_content)
+                tok_results = tokenizer(running_content, add_special_tokens=False)
+                tok_input_ids = tok_results["input_ids"]
+                if pending_input_ids:
+                    new_pending_inputs = tok_input_ids[
+                        len(input_ids) : len(input_ids) + len(pending_input_ids)
+                    ]
+                    if new_pending_inputs != pending_input_ids:
+                        # logging.warning("tokenization mismatch from concatenation.")
+                        pending_input_ids = new_pending_inputs
+                    input_ids.extend(pending_input_ids)
+                    if pending_weight:
+                        labels.extend(pending_input_ids)
+                    else:
+                        labels.extend([ignore_index] * len(pending_input_ids))
+                pending_input_ids = tok_results["input_ids"][len(input_ids) :]
+                pending_weight = self.weight and msg_content.weight not in [0, 0.0]
+        input_ids.extend(pending_input_ids)
+        if pending_weight:
+            labels.extend(pending_input_ids)
+        else:
+            labels.extend([ignore_index] * len(pending_input_ids))
+        attention_mask = [1] * len(input_ids)
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+
+
+class Chats(BaseModel):
+    """
+    top level data structure for chat conversations
+    """
+
+    conversation: List[Messages]
+
+    def __str__(self) -> str:
+        return "".join(str(c) for c in self.conversation)
+
+    def tokenized(
+        self, tokenizer: Callable[[str], dict[str, List[int]]], ignore_index=-100
+    ) -> dict[str, List[int]]:
+        input_ids = []
+        attention_mask = []
+        labels = []
+        for msg in self.conversation:
+            msg_results = msg.tokenized(tokenizer, ignore_index)
+            input_ids.extend(msg_results["input_ids"])
+            attention_mask.extend(msg_results["attention_mask"])
+            labels.extend(msg_results["labels"])
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "labels": labels,
+        }
+
+
+class ChatFormattedChats(Chats):
+    """
+    Chat formatted chats with formatter and optional train on inputs
+    """
+
+    formatter: Callable  # [[Union[dict, Chats]], Chats]
+    train_on_inputs: bool = False
+
+    def model_post_init(self, __context):
+        for i, msg in enumerate(self.conversation):
+            self.conversation[i] = self.formatter(msg, message_index=i)
+            if self.train_on_inputs:
+                self.conversation[i].weight = 1
+
+
+class PreferenceChats(BaseModel):
+    """
+    representation for preference data for chat
+    """
+
+    prompt: List[Messages]
+    chosen: Messages
+    rejected: Messages
--- a/src/axolotl/core/datasets/init.py
+++ b/src/axolotl/core/datasets/init.py
--- a/src/axolotl/core/datasets/chat.py
+++ b/src/axolotl/core/datasets/chat.py
@@ -0,0 +1,55 @@
+"""
+chat dataset module
+"""
+import os
+from typing import Callable, Optional, Union
+
+from datasets import Dataset
+from transformers import PreTrainedTokenizer
+
+from axolotl.core.chat.messages import ChatFormattedChats
+
+
+class TokenizedChatDataset(Dataset):
+    """
+    Tokenized chat dataset
+    """
+
+    def __init__(
+        self,
+        data: Dataset,
+        model_transform: Union[PreTrainedTokenizer, Callable],
+        *args,
+        message_transform: Optional[Callable] = None,
+        formatter=None,
+        process_count: Optional[int] = None,
+        keep_in_memory: Optional[bool] = False,
+        **kwargs,
+    ):
+        def map_fn(ex):
+            if message_transform is not None:
+                ex = message_transform(ex)
+            if formatter is not None:
+                ex = ChatFormattedChats(
+                    formatter=formatter,
+                    **ex,
+                )
+            else:
+                ex = ChatFormattedChats(
+                    **ex,
+                )
+            return ex.tokenized(model_transform)
+
+        process_or_cpu_count: int = (
+            process_count or os.cpu_count()  # type: ignore[assignment]
+        )
+        num_proc = min(64, process_or_cpu_count)
+        features = data.features.keys()
+        tokenized_data = data.map(
+            map_fn,
+            num_proc=num_proc,
+            keep_in_memory=keep_in_memory,
+            remove_columns=features,
+            desc="Tokenizing Chats",
+        )
+        super().__init__(tokenized_data.data, *args, **kwargs)
--- a/src/axolotl/core/datasets/transforms/init.py
+++ b/src/axolotl/core/datasets/transforms/init.py
--- a/src/axolotl/core/datasets/transforms/chat_builder.py
+++ b/src/axolotl/core/datasets/transforms/chat_builder.py
@@ -0,0 +1,150 @@
+"""
+This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
+"""
+from typing import Any, Mapping, Union
+
+
+def chat_message_transform_builder(  # pylint: disable=dangerous-default-value
+    train_on_inputs=False,
+    conversations_field: str = "conversations",
+    message_field_role: Union[str, list[str]] = ["role", "from"],  # commonly "role"
+    message_field_content: Union[str, list[str]] = [
+        "value",
+        "text",
+        "content",
+    ],  # commonly "content"
+    message_field_training: Union[str, list[str]] = [
+        "train",
+        "weight",
+    ],  # commonly "weight"
+):
+    """Builds a transform that takes a row from the dataset and converts it to a Chat
+
+    Args:
+        train_on_inputs (bool, optional):
+            If True, the transform will train on the inputs. If False, the transform will train on the targets.
+            Defaults to False.
+        conversations_field (str, optional):
+            The field name of the conversations. Defaults to "conversations".
+        message_field_role (str | list[str], optional):
+            The field name of the role. Defaults to "role".
+        message_field_content (str | list[str], optional):
+            The field name of the message content. Defaults to "content".
+        message_field_training (str | list[str], optional):
+            The field name of the train/weight. Defaults to "weight".
+
+    Returns:
+        Callable:
+            A function that takes a list of conversations and returns a list of messages.
+    """
+
+    message_field_role = (
+        [message_field_role]
+        if isinstance(message_field_role, str)
+        else message_field_role
+    )
+    message_field_content = (
+        [message_field_content]
+        if isinstance(message_field_content, str)
+        else message_field_content
+    )
+    message_weight_fields = (
+        [message_field_training]
+        if isinstance(message_field_training, str)
+        else message_field_training
+    )
+
+    role_value_mappings = {
+        "system": "system",
+        "user": "user",
+        "human": "user",
+        "assistant": "assistant",
+        "gpt": "assistant",
+        "tool": "tool",
+        "ipython": "ipython",
+    }
+    if train_on_inputs:
+        role_default_weights_mappings = {
+            "system": 1,
+            "user": 1,
+            "assistant": 1,
+            "tool": 1,
+            "ipython": 1,
+        }
+    else:
+        role_default_weights_mappings = {
+            "system": 0,
+            "user": 0,
+            "assistant": 1,
+            "tool": 0,
+            "ipython": 0,
+        }
+
+    def transform_builder(sample: Mapping[str, Any]):
+        if conversations_field not in sample:
+            raise ValueError(f"Field '{conversations_field}' not found in sample.")
+        # if none of the role fields are in the message, raise an error
+        if not any(
+            role in sample[conversations_field][0] for role in message_field_role
+        ):
+            raise ValueError("No role field found in message.")
+        role_field = next(
+            role
+            for role in message_field_role
+            if role in sample[conversations_field][0]
+        )
+        if not any(
+            field in sample[conversations_field][0] for field in message_field_content
+        ):
+            raise ValueError("No message_content field found in message.")
+        message_content_field = next(
+            field
+            for field in message_field_content
+            if field in sample[conversations_field][0]
+        )
+        if not any(
+            field in sample[conversations_field][0] for field in message_field_training
+        ):
+            message_weight_field = None
+        else:
+            message_weight_field = next(
+                field
+                for field in message_weight_fields
+                if field in sample[conversations_field][0]
+            )
+
+        messages = []
+        for message in sample[conversations_field]:
+            role = role_value_mappings[message[role_field]]
+            weight = (
+                int(message[message_weight_field])
+                if message_weight_field
+                else role_default_weights_mappings[role]
+            )
+
+            # TODO if "tool_calls" in message[message_content_field]: then convert tool call to ToolCallContents
+            if isinstance(message[message_content_field], str):
+                messages.append(
+                    {
+                        "role": role,
+                        "content": [
+                            {
+                                "type": "text",
+                                "value": message[message_content_field],
+                            }
+                        ],
+                        "weight": weight,
+                    }
+                )
+            else:
+                messages.append(
+                    {
+                        "role": role,
+                        "content": message[message_content_field],
+                        "weight": weight,
+                    }
+                )
+
+        return {"conversation": messages}
+
+    return transform_builder
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -4,8 +4,10 @@ Builder for the training args and trainer
 """

 import abc
+import gc
 import importlib
 import importlib.util
+import inspect
 import logging
 import math
 import os
@@ -15,16 +17,17 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 from functools import wraps
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Type, Union
+from typing import Any, Dict, List, Literal, Optional, Type, Union

 import torch
 import transformers
 from datasets import Dataset
+from peft.optimizers import create_loraplus_optimizer
+from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
    EarlyStoppingCallback,
-    PreTrainedModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
@@ -40,13 +43,15 @@ from trl import (
    KTOTrainer,
    ORPOConfig,
    ORPOTrainer,
+    RewardConfig,
+    RewardTrainer,
 )
-from trl.trainer.utils import pad_to_length
+from trl.trainer.utils import RewardDataCollatorWithPadding, pad_to_length

-from axolotl.loraplus import create_loraplus_optimizer
+from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
-from axolotl.utils import is_mlflow_available
+from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
@@ -59,12 +64,14 @@ from axolotl.utils.callbacks import (
    log_prediction_callback_factory,
 )
 from axolotl.utils.callbacks.lisa import lisa_callback_factory
+from axolotl.utils.chat_templates import get_chat_template
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    MambaDataCollator,
    V2BatchSamplerDataCollatorForSeq2Seq,
 )
+from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
 from axolotl.utils.models import ensure_dtype
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.schedulers import (
@@ -248,6 +255,10 @@ class AxolotlTrainingMixins:
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
+    chat_template: Optional[str] = field(
+        default=None,
+        metadata={"help": "Chat template converting chat messages to text"},
+    )


@dataclass
@@ -293,6 +304,13 @@ class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
    )


+@dataclass
+class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
+    """
+    Reward config for Reward training
+    """
+
+
 class SchedulerMixin(Trainer):
    """
    Mixin class for scheduler setup in CausalTrainer.
@@ -390,12 +408,10 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
    def __init__(
        self,
        *_args,
-        num_epochs=1,
        bench_data_collator=None,
        eval_data_collator=None,
        **kwargs,
    ):
-        self.num_epochs = num_epochs
        self.bench_data_collator = bench_data_collator
        self.eval_data_collator = eval_data_collator
        super().__init__(*_args, **kwargs)
@@ -420,7 +436,13 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        if (
            self.args.loraplus_lr_ratio is None
            and self.args.alternate_optimizer
-            not in ["optimi_adamw", "ao_adamw_8bit", "ao_adamw_4bit", "ao_adamw_fp8"]
+            not in [
+                "optimi_adamw",
+                "ao_adamw_8bit",
+                "ao_adamw_4bit",
+                "ao_adamw_fp8",
+                "adopt_adamw",
+            ]
        ):
            return super().create_optimizer()

@@ -454,14 +476,14 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
            if self.args.loraplus_lr_ratio is not None:
                loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
                loraplus_lr_embedding = getattr(
-                    self.args, "loraplus_lr_embedding", None
+                    self.args, "loraplus_lr_embedding", 1e-6
                )
                self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
                    opt_model,
                    optimizer_cls,
-                    optimizer_kwargs,
-                    loraplus_lr_ratio,
-                    loraplus_lr_embedding,
+                    loraplus_lr_ratio=loraplus_lr_ratio,
+                    loraplus_lr_embedding=loraplus_lr_embedding,
+                    **optimizer_kwargs,
                )
            elif self.args.alternate_optimizer == "optimi_adamw":
                from optimi import AdamW
@@ -489,6 +511,14 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
                    AdamWFp8(optimizer_grouped_parameters, **optimizer_kwargs)
                )
+            elif self.args.alternate_optimizer == "adopt_adamw":
+                from axolotl.utils.optimizers.adopt import ADOPT
+
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    ADOPT(
+                        optimizer_grouped_parameters, decoupled=True, **optimizer_kwargs
+                    )
+                )

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
@@ -504,9 +534,10 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
                batch_max_len = self.args.max_seq_length
            else:
                batch_size = 1
-                batch_max_len = (
-                    self.args.per_device_train_batch_size * self.args.max_seq_length
+                train_batch_size = (
+                    self.state.train_batch_size or self.args.per_device_train_batch_size
                )
+                batch_max_len = train_batch_size * self.args.max_seq_length
            return MultipackBatchSampler(
                RandomSampler(self.train_dataset),
                lengths=get_dataset_lengths(self.train_dataset),
@@ -650,7 +681,9 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        return DataLoader(bench_dataset, **dataloader_params)
        # return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))

-    def compute_loss(self, model, inputs, return_outputs=False):
+    def compute_loss(
+        self, model, inputs, return_outputs=False, num_items_in_batch=None
+    ):
        # use one's weighted cross entropy loss calc
        # if self.args.sample_packing:
        #     labels = inputs.pop("labels")
@@ -658,8 +691,18 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        #     loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
        #     return (loss, outputs) if return_outputs else loss
        if self.args.orpo_alpha:
-            return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs)
-        return super().compute_loss(model, inputs, return_outputs=return_outputs)
+            return self.orpo_compute_loss(
+                model,
+                inputs,
+                return_outputs=return_outputs,
+                num_items_in_batch=num_items_in_batch,
+            )
+        return super().compute_loss(
+            model,
+            inputs,
+            return_outputs=return_outputs,
+            num_items_in_batch=num_items_in_batch,
+        )

    @staticmethod
    def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
@@ -755,7 +798,13 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        ).squeeze(2)
        return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1)

-    def orpo_compute_loss(self, model, inputs, return_outputs=False):
+    def orpo_compute_loss(
+        self,
+        model,
+        inputs,
+        return_outputs=False,
+        num_items_in_batch=None,  # pylint: disable=unused-argument
+    ):
        concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
            inputs,
            label_pad_token=-100,
@@ -861,13 +910,13 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
        for key, value in metrics.items():
            self._stored_metrics[train_eval][key].append(value)

-    def _save_checkpoint(self, model, trial, metrics=None):
+    def _save_checkpoint(self, model, trial, **kwargs):
        # make sure the checkpoint dir exists, since trainer is flakey
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)
        os.makedirs(output_dir, exist_ok=True)
-        return super()._save_checkpoint(model, trial, metrics=metrics)
+        return super()._save_checkpoint(model, trial, **kwargs)


 class AxolotlMambaTrainer(AxolotlTrainer):
@@ -882,6 +931,7 @@ class AxolotlMambaTrainer(AxolotlTrainer):
        model,
        inputs,
        return_outputs=False,  # pylint: disable=unused-argument
+        num_items_in_batch=None,  # pylint: disable=unused-argument
    ):
        input_ids = inputs.pop("input_ids")
        lm_logits = model(input_ids).logits
@@ -966,9 +1016,9 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
            self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
                opt_model,
                optimizer_cls,
-                optimizer_kwargs,
-                loraplus_lr_ratio,
-                loraplus_lr_embedding,
+                loraplus_lr_ratio=loraplus_lr_ratio,
+                loraplus_lr_embedding=loraplus_lr_embedding,
+                **optimizer_kwargs,
            )

        if is_sagemaker_mp_enabled():
@@ -988,15 +1038,50 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):

        return super().push_to_hub(*args, **kwargs)

+    @staticmethod
    def tokenize_row(
-        self, feature, model: Optional[Union[PreTrainedModel, torch.nn.Module]] = None
+        features,
+        processing_class,
+        max_prompt_length,
+        max_completion_length,
+        add_special_tokens,
    ) -> Dict:
-        res = super().tokenize_row(feature, model=model)
-        if self.tokenizer.bos_token_id is None and res["prompt_input_ids"][0] is None:
+        res = DPOTrainer.tokenize_row(
+            features,
+            processing_class,
+            max_prompt_length,
+            max_completion_length,
+            add_special_tokens,
+        )
+        # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
+        if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
            for key in res.keys():
                res[key] = res[key][1:]
+
+        if processing_class.bos_token and processing_class.bos_token_id is not None:
+            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
+            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
+                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
+                res["chosen_labels"] = res["chosen_labels"][1:]
+                res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
+            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
+                res["rejected_input_ids"] = res["rejected_input_ids"][1:]
+                res["rejected_labels"] = res["rejected_labels"][1:]
+                res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
+
        return res

+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: Dict[str, Union[torch.Tensor, Any]],
+        num_items_in_batch=None,
+    ) -> torch.Tensor:
+        loss: torch.Tensor = super().training_step(model, inputs, num_items_in_batch)
+        gc.collect()
+        torch.cuda.empty_cache()
+        return loss
+

 class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
    """
@@ -1022,6 +1107,14 @@ class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
    tag_names = ["axolotl", "cpo"]


+class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
+    """
+    Extend the base RewardTrainer for axolotl helpers
+    """
+
+    tag_names = ["axolotl", "reward"]
+
+
 class TrainerBuilderBase(abc.ABC):
    """
    Base class for trainer builder
@@ -1032,10 +1125,11 @@ class TrainerBuilderBase(abc.ABC):
    _model_ref = None
    _peft_config = None

-    def __init__(self, cfg, model, tokenizer):
+    def __init__(self, cfg, model, tokenizer, processor=None):
        self.cfg = cfg
        self.model = model
        self.tokenizer = tokenizer
+        self.processor = processor

        # in case the model supports tagging, add the axolotl tag.
        # This makes sure the tag is correctly pushed even if a user calls
@@ -1081,26 +1175,55 @@ class TrainerBuilderBase(abc.ABC):

    def get_callbacks(self) -> List[TrainerCallback]:
        callbacks = []
+
+        plugin_manager = PluginManager.get_instance()
+        callbacks.extend(
+            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
+        )
+
        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
            )
        if self.cfg.use_mlflow and is_mlflow_available():
+            from transformers.integrations.integration_utils import MLflowCallback
+
            from axolotl.utils.callbacks.mlflow_ import (
                SaveAxolotlConfigtoMlflowCallback,
            )

+            callbacks.extend(
+                [
+                    SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path),
+                    MLflowCallback,
+                ]
+            )
+        if self.cfg.use_comet and is_comet_available():
+            from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback
+
            callbacks.append(
-                SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
+                SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
            )

        return callbacks

-    @abstractmethod
    def get_post_trainer_create_callbacks(self, trainer):
        """
        Callbacks added after the trainer is created, usually b/c these need access to the trainer
        """
+        callbacks = []
+        if self.cfg.plugins:
+            plugin_manager = PluginManager.get_instance()
+            callbacks.extend(
+                [
+                    cb
+                    for cb in plugin_manager.add_callbacks_post_trainer(
+                        self.cfg, trainer
+                    )
+                    if cb
+                ]
+            )
+        return callbacks

    def hook_pre_create_training_args(self, training_arguments_kwargs):
        # TODO
@@ -1161,6 +1284,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                trainer, self.tokenizer, "mlflow"
            )
            callbacks.append(LogPredictionCallback(self.cfg))
+        if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0:
+            LogPredictionCallback = log_prediction_callback_factory(
+                trainer, self.tokenizer, "comet_ml"
+            )
+            callbacks.append(LogPredictionCallback(self.cfg))

        if self.cfg.do_bench_eval:
            callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer))
@@ -1178,6 +1306,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))
+
+        callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
        return callbacks

    def _get_trainer_cls(self):
@@ -1185,6 +1315,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            return ReLoRATrainer
        if self.cfg.model_config_type == "mamba":
            return AxolotlMambaTrainer
+        if self.cfg.reward_model:
+            return AxolotlRewardTrainer
        return AxolotlTrainer

    def build(self, total_num_steps):
@@ -1293,17 +1425,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
            # no eval set, so don't eval
-            training_arguments_kwargs["evaluation_strategy"] = "no"
+            training_arguments_kwargs["eval_strategy"] = "no"
        elif self.cfg.eval_steps:
-            training_arguments_kwargs["evaluation_strategy"] = "steps"
+            training_arguments_kwargs["eval_strategy"] = "steps"
            training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps
-        elif self.cfg.evaluation_strategy:
-            training_arguments_kwargs[
-                "evaluation_strategy"
-            ] = self.cfg.evaluation_strategy
+        elif self.cfg.eval_strategy:
+            training_arguments_kwargs["eval_strategy"] = self.cfg.eval_strategy
        else:
            # we have an eval set, but no steps defined, default to use epoch
-            training_arguments_kwargs["evaluation_strategy"] = "epoch"
+            training_arguments_kwargs["eval_strategy"] = "epoch"

        if self.cfg.save_steps:
            training_arguments_kwargs["save_strategy"] = "steps"
@@ -1369,6 +1499,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            training_arguments_kwargs[
                "per_device_eval_batch_size"
            ] = self.cfg.eval_batch_size
+        if self.cfg.auto_find_batch_size is not None:
+            training_arguments_kwargs[
+                "auto_find_batch_size"
+            ] = self.cfg.auto_find_batch_size
        training_arguments_kwargs[
            "gradient_accumulation_steps"
        ] = self.cfg.gradient_accumulation_steps
@@ -1402,15 +1536,22 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        report_to = []
        if self.cfg.use_wandb:
            report_to.append("wandb")
+            if self.cfg.wandb_name:
+                training_arguments_kwargs["run_name"] = self.cfg.wandb_name
        if self.cfg.use_mlflow:
            report_to.append("mlflow")
        if self.cfg.use_tensorboard:
            report_to.append("tensorboard")
+        if self.cfg.use_comet:
+            report_to.append("comet_ml")

        training_arguments_kwargs["report_to"] = report_to
-        training_arguments_kwargs["run_name"] = (
-            self.cfg.wandb_name if self.cfg.use_wandb else None
-        )
+        if self.cfg.use_wandb:
+            training_arguments_kwargs["run_name"] = self.cfg.wandb_name
+        elif self.cfg.use_mlflow:
+            training_arguments_kwargs["run_name"] = self.cfg.mlflow_run_name
+        else:
+            training_arguments_kwargs["run_name"] = None
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
        )
@@ -1451,9 +1592,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        )

        training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
-        training_arguments_kwargs[
-            "multipack_real_batches"
-        ] = not self.cfg.flash_attention
+        training_arguments_kwargs["multipack_real_batches"] = (
+            not self.cfg.flash_attention or self.cfg.multipack_real_batches
+        )
        training_arguments_kwargs["eval_sample_packing"] = bool(
            self.cfg.eval_sample_packing
        )
@@ -1498,6 +1639,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        )
        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
+        if self.cfg.chat_template:
+            training_arguments_kwargs["chat_template"] = get_chat_template(
+                self.cfg.chat_template,
+                tokenizer=self.tokenizer,
+            )

        if self.cfg.rl == "orpo":
            training_arguments_kwargs["orpo_alpha"] = self.cfg.orpo_alpha
@@ -1509,11 +1655,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        trainer_kwargs = {}

+        if self.cfg.reward_model:
+            trainer_kwargs["max_length"] = self.cfg.sequence_len
+
+        # pylint: disable=duplicate-code
        if self.cfg.optimizer in [
            "optimi_adamw",
            "ao_adamw_4bit",
            "ao_adamw_8bit",
            "ao_adamw_fp8",
+            "adopt_adamw",
        ]:
            # Set default so transformers doesn't throw
            training_arguments_kwargs["optim"] = "adamw_hf"
@@ -1552,13 +1703,22 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                "accelerator_config"
            ] = self.cfg.accelerator_config

-        training_args = (
-            AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
-                **training_arguments_kwargs,
-            )
+        training_args_cls = (
+            AxolotlTrainingArguments
+            if not self.cfg.reward_model
+            else AxolotlRewardConfig
+        )
+        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
+            **training_arguments_kwargs,
        )
        training_args = self.hook_post_create_training_args(training_args)

+        # unset run_name so wandb sets up experiment names
+        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
+            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
+                None
+            )
+
        data_collator_kwargs = {
            "padding": True,  # True/"longest" is the default
        }
@@ -1571,27 +1731,37 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
            data_collator_kwargs["pad_to_multiple_of"] = 64

+        if self.cfg.reward_model:
+            data_collator_kwargs["max_length"] = self.cfg.sequence_len
+
        trainer_cls = self._get_trainer_cls()
        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
            trainer_kwargs, trainer_cls
        )
+        if eval_data_collator := self.build_collator(
+            training_args, is_eval=True, **data_collator_kwargs
+        ):
+            if not self.cfg.reward_model:
+                trainer_kwargs["eval_data_collator"] = eval_data_collator
+        if not self.cfg.reward_model:
+            trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq(
+                self.tokenizer,
+                return_tensors="pt",
+                **data_collator_kwargs,
+            )
+        sig = inspect.signature(trainer_cls)
+        if "processing_class" in sig.parameters.keys():
+            trainer_kwargs["processing_class"] = self.tokenizer
+        else:
+            trainer_kwargs["tokenizer"] = self.tokenizer
+
        trainer = trainer_cls(
            model=self.model,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            args=training_args,
-            tokenizer=self.tokenizer,
            data_collator=self.build_collator(training_args, **data_collator_kwargs),
-            eval_data_collator=self.build_collator(
-                training_args, is_eval=True, **data_collator_kwargs
-            ),
-            bench_data_collator=transformers.DataCollatorForSeq2Seq(
-                self.tokenizer,
-                return_tensors="pt",
-                **data_collator_kwargs,
-            ),
            callbacks=self.get_callbacks(),
-            num_epochs=self.cfg.num_epochs,
            **trainer_kwargs,
        )
        trainer = self.hook_post_create_trainer(trainer)
@@ -1625,9 +1795,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                V2BatchSamplerDataCollatorForSeq2Seq,
                BatchSamplerDataCollatorForSeq2Seq,
                DataCollatorForSeq2Seq,
+                RewardDataCollatorWithPadding,
            ]
        ]
-        if use_batch_sampler_collator:
+        if self.cfg.reward_model:
+            collator = RewardDataCollatorWithPadding
+            if "max_length" in kwargs:
+                kwargs.pop("max_length")
+        elif use_batch_sampler_collator:
            if self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
                collator = V2BatchSamplerDataCollatorForSeq2Seq
            elif (
@@ -1638,7 +1813,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            else:
                collator = BatchSamplerDataCollatorForSeq2Seq
        else:
-            collator = DataCollatorForSeq2Seq
+            if self.cfg.processor_type and self.processor:
+                collator = MultiModalChatDataCollator
+                kwargs["processor"] = self.processor
+                kwargs["chat_template"] = training_args.chat_template
+            else:
+                collator = DataCollatorForSeq2Seq

        return collator(
            self.tokenizer,
@@ -1659,7 +1839,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = []
+        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
        return callbacks

    def build_training_arguments(self, total_num_steps):
@@ -1687,10 +1867,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors

        if self.eval_dataset:
-            training_args_kwargs["evaluation_strategy"] = "steps"
+            training_args_kwargs["eval_strategy"] = "steps"
            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
        else:
-            training_args_kwargs["evaluation_strategy"] = "no"
+            training_args_kwargs["eval_strategy"] = "no"

        if self.cfg.bf16 or self.cfg.bfloat16:
            training_args_kwargs["bf16"] = True
@@ -1745,17 +1925,18 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            # default to saving each epoch if not defined
            training_args_kwargs["save_strategy"] = "epoch"

+        training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
+
        if self.cfg.rl_beta:
            training_args_kwargs["beta"] = self.cfg.rl_beta
        if self.cfg.orpo_alpha:
            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
            training_args_kwargs["beta"] = self.cfg.orpo_alpha

-        training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
-        training_args_cls = AxolotlDPOConfig
        if self.cfg.rpo_alpha is not None:
            training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

+        training_args_cls = None
        if self.cfg.rl == "simpo":
            training_args_cls = AxolotlCPOConfig
            training_args_kwargs["loss_type"] = "simpo"
@@ -1764,13 +1945,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if self.cfg.cpo_alpha is not None:
                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha

-        if self.cfg.rl == "orpo":
+        elif self.cfg.rl == "orpo":
            training_args_cls = AxolotlORPOConfig
            training_args_kwargs["max_length"] = self.cfg.sequence_len
            if self.cfg.max_prompt_len:
                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len

-        if self.cfg.rl == "kto":
+        elif self.cfg.rl == "kto":
            training_args_cls = AxolotlKTOConfig

            training_args_kwargs["desirable_weight"] = (
@@ -1785,6 +1966,17 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if self.cfg.max_prompt_len:
                training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len

+        else:
+            training_args_cls = AxolotlDPOConfig
+            if self.cfg.rl == "ipo":
+                training_args_kwargs["loss_type"] = "ipo"
+            training_args_kwargs["max_length"] = self.cfg.sequence_len
+            training_args_kwargs["max_completion_length"] = None
+            training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
+            training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
+            if self.cfg.dpo_use_weighting is not None:
+                training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
+
        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
            output_dir=self.cfg.output_dir,
            per_device_train_batch_size=self.cfg.micro_batch_size,
@@ -1805,7 +1997,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        training_args = self.build_training_arguments(total_num_steps)
        dpo_trainer_kwargs = {}
        if self.cfg.rl == "ipo":
-            dpo_trainer_kwargs["loss_type"] = "ipo"
            if self.cfg.dpo_label_smoothing:
                dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
        if self.eval_dataset:
@@ -1819,12 +2010,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        if self.cfg.rl in ["dpo", "ipo"]:
            trainer_cls = AxolotlDPOTrainer
            trainer_cls_args = [self.model, self.model_ref]
-
-            # these aren't used for the ORPO trainer
-            dpo_trainer_kwargs["max_length"] = self.cfg.sequence_len
-            dpo_trainer_kwargs["max_target_length"] = None
-            dpo_trainer_kwargs["max_prompt_length"] = self.cfg.sequence_len
-            dpo_trainer_kwargs["generate_during_eval"] = True
        elif self.cfg.rl == "orpo":
            trainer_cls = AxolotlORPOTrainer
            trainer_cls_args = [self.model]
@@ -1836,11 +2021,17 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            trainer_cls_args = [self.model]
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")
+
+        sig = inspect.signature(trainer_cls)
+        if "processing_class" in sig.parameters.keys():
+            dpo_trainer_kwargs["processing_class"] = self.tokenizer
+        else:
+            dpo_trainer_kwargs["tokenizer"] = self.tokenizer
+
        dpo_trainer = trainer_cls(
            *trainer_cls_args,
            args=training_args,
            train_dataset=self.train_dataset,
-            tokenizer=self.tokenizer,
            callbacks=self.get_callbacks(),
            **dpo_trainer_kwargs,
        )
@@ -1862,11 +2053,11 @@ class HFPPOTrainerBuilder(TrainerBuilderBase):
    """

    def get_callbacks(self):
-        callbacks = []
+        callbacks = super().get_callbacks()
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
-        callbacks = []
+        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
        return callbacks

    def build(self, total_num_steps):
--- a/src/axolotl/integrations/LICENSE.md
+++ b/src/axolotl/integrations/LICENSE.md
@@ -0,0 +1,58 @@
+### AXOLOTL COMMUNITY LICENSE AGREEMENT
+
+This Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and
+any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms
+and conditions set forth in this Agreement.
+
+1.  Definitions
+    1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement.
+    1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl,
+        which may be licensed separately by their respective  authors and/or licensors.
+    1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at
+        https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which
+        permits Plugin Integrations to integrate with the Axolotl service.
+2.  Grant of License
+    2.1	Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge,
+        publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions:
+        - Licensee must comply with all the terms and conditions of this Agreement.
+        - Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial
+          portions of the Software.
+    2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.
+3.  Restrictions
+    3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for
+        free or for sale any services, platform, or equivalent  to third parties for the purposes of allowing such
+        third parties to fine-tune artificial intelligence models.
+    3.2 Licensee shall not:
+        - Use the Software for any illegal or unauthorized purpose.
+        - Reverse engineer, decompile, or disassemble the Software.
+        - Remove or modify any copyright, trademark, or other proprietary notices contained in the Software.
+        - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the
+          Software or interfere with any third-party use of the Software.
+    3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.
+4.  Intellectual Property Rights
+    4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee
+        acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to
+        Licensee.
+5.  Disclaimer of Warranty
+    5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+        TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL
+        THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+        CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+        DEALINGS IN THE SOFTWARE.
+6.  Termination
+    6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and
+        conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any
+        copies in its possession.
+7.  Governing Law
+    7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California,
+        without regards to conflicts of laws provisions thereof.
+8.  Entire Agreement
+    8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter
+        hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning
+        the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and
+        Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms
+        on a go-forward basis.  Axolotl will use commercially reasonable efforts to provide Licensee notice of any
+        material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be
+        bound by the terms and conditions of this Agreement.
+
+This Agreement was last updated on August 23, 2024.
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -0,0 +1,432 @@
+# Copyright 2024 Axolotl AI. All rights reserved.
+#
+# This software may be used and distributed according to
+# the terms of the Axolotl Community License Agreement (the "License");
+# you may not use this file except in compliance with the License.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+"""
+Base class for all plugins.
+
+A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
+Plugins can be used to integrate third-party models, modify the training process, or add new features.
+
+To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
+"""
+import collections
+import importlib
+import logging
+from typing import OrderedDict
+
+
+class BasePlugin:
+    """
+    Base class for all plugins. Defines the interface for plugin methods.
+
+    Attributes:
+    None
+
+    Methods:
+    register(cfg): Registers the plugin with the given configuration.
+    pre_model_load(cfg): Performs actions before the model is loaded.
+    post_model_load(cfg, model): Performs actions after the model is loaded.
+    pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
+    post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
+    create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
+    create_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler.
+    add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
+    add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
+    """
+
+    def __init__(self):
+        """
+        Initializes the BasePlugin.
+        """
+
+    def register(self, cfg):  # pylint: disable=unused-argument
+        """
+        Registers the plugin with the given configuration.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+
+        Returns:
+        None
+        """
+
+    def get_input_args(self):
+        """
+        Returns a pydantic model for the plugin's input arguments.
+        """
+
+    def pre_model_load(self, cfg):  # pylint: disable=unused-argument
+        """
+        Performs actions before the model is loaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+
+        Returns:
+        None
+        """
+
+    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after the model is loaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+
+    def pre_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions before LoRA weights are loaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+
+    def post_lora_load(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after LoRA weights are loaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+
+    def create_optimizer(self, cfg, trainer):  # pylint: disable=unused-argument
+        """
+        Creates and returns an optimizer for training.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        trainer (object): The trainer object for training.
+
+        Returns:
+        object: The created optimizer.
+        """
+
+    def create_lr_scheduler(
+        self, cfg, trainer, optimizer
+    ):  # pylint: disable=unused-argument
+        """
+        Creates and returns a learning rate scheduler.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        trainer (object): The trainer object for training.
+        optimizer (object): The optimizer for training.
+
+        Returns:
+        object: The created learning rate scheduler.
+        """
+
+    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        setup callbacks before creating the trainer.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        model (object): The loaded model.
+
+        Returns:
+        List[callable]: A list of callback functions to be added to the TrainingArgs
+        """
+        return []
+
+    def add_callbacks_post_trainer(
+        self, cfg, trainer
+    ):  # pylint: disable=unused-argument
+        """
+        Adds callbacks to the trainer after creating the trainer.
+        This is useful for callbacks that require access to the model or trainer.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+        trainer (object): The trainer object for training.
+
+        Returns:
+        List[callable]: A list of callback functions to be added
+        """
+        return []
+
+    def post_train(self, cfg, model):  # pylint: disable=unused-argument
+        """
+        Performs actions after training is complete.
+
+        Parameters:
+        cfg (dict): The axolotl configuration
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+
+    def post_train_unload(self, cfg):  # pylint: disable=unused-argument
+        """
+        Performs actions after training is complete and the model is unloaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+
+        Returns:
+        None
+        """
+
+
+def load_plugin(plugin_name: str) -> BasePlugin:
+    """
+    Loads a plugin based on the given plugin name.
+
+    The plugin name should be in the format "module_name.class_name".
+    This function splits the plugin name into module and class, imports the module,
+    retrieves the class from the module, and creates an instance of the class.
+
+    Parameters:
+    plugin_name (str): The name of the plugin to be loaded. The name should be in the format "module_name.class_name".
+
+    Returns:
+    BasePlugin: An instance of the loaded plugin.
+
+    Raises:
+    ImportError: If the plugin module cannot be imported.
+    """
+    # split the plugin name into module and class
+    module_name, class_name = plugin_name.rsplit(".", 1)
+
+    # import the module
+    module = importlib.import_module(module_name)
+    # instantiate the class
+    plugin_class = getattr(module, class_name)
+    # create an instance of the class
+    plugin = plugin_class()
+
+    return plugin
+
+
+class PluginManager:
+    """
+    The PluginManager class is responsible for loading and managing plugins.
+    It should be a singleton so it can be accessed from anywhere in the codebase.
+
+    Attributes:
+    plugins (List[BasePlugin]): A list of loaded plugins.
+
+    Methods:
+    get_instance(): Static method to get the singleton instance of PluginManager.
+    register(plugin_name: str): Registers a new plugin by its name.
+    pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
+    """
+
+    plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
+
+    _instance = None
+
+    def __new__(cls):
+        """
+        Creates a new instance of PluginManager if it doesn't exist yet.
+        """
+        if cls._instance is None:
+            cls._instance = super(PluginManager, cls).__new__(cls)
+            cls._instance.plugins = collections.OrderedDict()
+        return cls._instance
+
+    @staticmethod
+    def get_instance() -> "PluginManager":
+        """
+        Returns the singleton instance of PluginManager.
+        If the instance doesn't exist, it creates a new one.
+        """
+        if PluginManager._instance is None:
+            PluginManager()
+        return PluginManager._instance  # type: ignore
+
+    def register(self, plugin_name: str):
+        """
+        Registers a new plugin by its name.
+
+        Parameters:
+        plugin_name (str): The name of the plugin to be registered.
+
+        Returns:
+        None
+
+        Raises:
+        ImportError: If the plugin module cannot be imported.
+        """
+        try:
+            plugin = load_plugin(plugin_name)
+            self.plugins[plugin_name] = plugin
+        except ImportError:
+            logging.error(f"Failed to load plugin: {plugin_name}")
+
+    def get_input_args(self):
+        """
+        Returns a list of Pydantic classes for all registered plugins' input arguments.'
+
+        Returns:
+        list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
+        """
+        input_args = []
+        for plugin in self.plugins.values():
+            input_args_from_plugin = plugin.get_input_args()
+            if input_args_from_plugin is not None:
+                input_args.append(input_args_from_plugin)
+        return input_args
+
+    def pre_model_load(self, cfg):
+        """
+        Calls the pre_model_load method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.pre_model_load(cfg)
+
+    def post_model_load(self, cfg, model):
+        """
+        Calls the post_model_load method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.post_model_load(cfg, model)
+
+    def pre_lora_load(self, cfg, model):
+        """
+        Calls the pre_lora_load method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.pre_lora_load(cfg, model)
+
+    def post_lora_load(self, cfg, model):
+        """
+        Calls the post_lora_load method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.post_lora_load(cfg, model)
+
+    def create_optimizer(self, cfg, trainer):
+        """
+        Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        trainer (object): The trainer object for training.
+
+        Returns:
+        object: The created optimizer, or None if none was found.
+        """
+        for plugin in self.plugins.values():
+            optimizer = plugin.create_optimizer(cfg, trainer)
+            if optimizer is not None:
+                return optimizer
+        return None
+
+    def create_lr_scheduler(self, cfg, trainer, optimizer):
+        """
+        Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        trainer (object): The trainer object for training.
+        optimizer (object): The optimizer for training.
+
+        Returns:
+        object: The created learning rate scheduler, or None if none was found.
+        """
+        for plugin in self.plugins.values():
+            scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
+            if scheduler is not None:
+                return scheduler
+        return None
+
+    def add_callbacks_pre_trainer(self, cfg, model):
+        """
+        Calls the add_callbacks_pre_trainer method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        List[callable]: A list of callback functions to be added to the TrainingArgs.
+        """
+        callbacks = []
+        for plugin in self.plugins.values():
+            plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
+            if plugin_callbacks:  # if the plugin returned a list of callbacks
+                callbacks.extend(plugin_callbacks)
+        return callbacks
+
+    def add_callbacks_post_trainer(self, cfg, trainer):
+        """
+        Calls the add_callbacks_post_trainer method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        trainer (object): The trainer object for training.
+
+        Returns:
+        List[callable]: A list of callback functions to be added to the TrainingArgs.
+        """
+        callbacks = []
+        for plugin in self.plugins.values():
+            plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer)
+            if plugin_callbacks:
+                callbacks.extend(plugin_callbacks)
+        return callbacks
+
+    def post_train_unload(self, cfg):
+        """
+        Calls the post_train_unload method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins.values():
+            plugin.post_train_unload(cfg)
--- a/src/axolotl/integrations/config.py
+++ b/src/axolotl/integrations/config.py
@@ -0,0 +1,65 @@
+# Copyright 2024 Axolotl AI. All rights reserved.
+#
+# This software may be used and distributed according to
+# the terms of the Axolotl Community License Agreement (the "License");
+# you may not use this file except in compliance with the License.
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations under
+# the License.
+
+"""
+module to handle merging the plugins' input arguments with the base configurations.
+
+this was moved here to prevent circular imports
+"""
+
+from typing import Any, Dict, List
+
+from axolotl.utils.config.models.input.v0_4_1 import (
+    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
+)
+from axolotl.utils.config.models.input.v0_4_1 import (
+    AxolotlInputConfig as AxolotlInputConfigBase,
+)
+
+
+def merge_input_args():
+    """
+    Merges input arguments from registered plugins with the base configurations.
+
+    This function retrieves the input arguments from registered plugins using the PluginManager.
+    It then dynamically creates new classes, AxolotlConfigWCapabilities and AxolotlInputConfig,
+    that inherit from the base configurations and include the input arguments from the plugins.
+
+    Returns:
+    tuple: A tuple containing the newly created classes, AxolotlConfigWCapabilities and AxolotlInputConfig.
+    """
+    from axolotl.integrations.base import PluginManager
+
+    plugin_manager = PluginManager.get_instance()
+    input_args: List[str] = plugin_manager.get_input_args()
+    plugin_classes = []
+    dynamic_input = ""
+    for plugin_args in input_args:
+        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
+        dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
+        plugin_classes.append(plugin_cls)
+    if dynamic_input:
+        dynamic_input += f"class AxolotlConfigWCapabilities(AxolotlConfigWCapabilitiesBase, {', '.join(plugin_classes)}):\n    pass\n"
+        dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n    pass\n"
+
+        namespace: Dict[Any, Any] = {}
+        exec(  # pylint: disable=exec-used  # nosec B102
+            dynamic_input, globals(), namespace
+        )
+        AxolotlInputConfig = namespace[  # pylint: disable=invalid-name
+            "AxolotlInputConfig"
+        ]
+        AxolotlConfigWCapabilities = namespace[  # pylint: disable=invalid-name
+            "AxolotlConfigWCapabilities"
+        ]
+        return AxolotlConfigWCapabilities, AxolotlInputConfig
+    return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
--- a/src/axolotl/integrations/grokfast/LICENSE
+++ b/src/axolotl/integrations/grokfast/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/src/axolotl/integrations/grokfast/README.md
+++ b/src/axolotl/integrations/grokfast/README.md
@@ -0,0 +1,13 @@
+# Grokfast Optimizer
+
+See https://github.com/ironjr/grokfast
+
+### Usage
+
+```yaml
+plugins:
+  - axolotl.integrations.grokfast.GrokfastPlugin
+
+grokfast_alpha: 2.0
+grokfast_lamb: 0.98
+```
--- a/src/axolotl/integrations/grokfast/init.py
+++ b/src/axolotl/integrations/grokfast/init.py
@@ -0,0 +1,50 @@
+"""
+Grokfast plugin for Axolotl
+"""
+import logging
+
+from transformers.trainer_callback import TrainerCallback
+
+from ..base import BasePlugin
+from .args import GrokfastArgs  # pylint: disable=unused-import. # noqa: F401
+from .optimizer import gradfilter_ema
+
+LOG = logging.getLogger("axolotl.integrations.grokfast")
+
+
+class GrokfastCallbackHandler(TrainerCallback):
+    """
+    Transformer trainer callbacks for Grokfast
+    """
+
+    def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs):
+        super().__init__(*args_, **kwargs)
+        self.grads = None
+        self.alpha = alpha
+        self.lamb = lamb
+
+    def on_train_begin(self, *args_, **kwargs):  # pylint: disable=unused-argument
+        self.grads = None
+
+    def on_pre_optimizer_step(
+        self, args_, state, control, **kwargs
+    ):  # pylint: disable=unused-argument
+        model = kwargs.pop("model")
+        self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
+        return control
+
+
+class GrokfastPlugin(BasePlugin):
+    """
+    Plugin for Grokfast optimizer integraton with Axolotl.
+    """
+
+    def get_input_args(self):
+        return "axolotl.integrations.grokfast.GrokfastArgs"
+
+    def add_callbacks_post_trainer(self, cfg, trainer):
+        LOG.info("Adding Grokfast callback to the trainer")
+        callback = GrokfastCallbackHandler(
+            alpha=cfg.grokfast_alpha, lamb=cfg.grokfast_lamb
+        )
+        return [callback]
--- a/src/axolotl/integrations/grokfast/args.py
+++ b/src/axolotl/integrations/grokfast/args.py
@@ -0,0 +1,15 @@
+"""
+config args for grokfast plugin
+"""
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class GrokfastArgs(BaseModel):
+    """
+    Input args for Grokfast optimizer.
+    """
+
+    grokfast_alpha: Optional[float] = 0.98
+    grokfast_lamb: Optional[float] = 2.0
--- a/src/axolotl/integrations/grokfast/optimizer.py
+++ b/src/axolotl/integrations/grokfast/optimizer.py
@@ -0,0 +1,63 @@
+# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
+# Reference: https://github.com/ironjr/grokfast
+
+# pylint: skip-file
+from collections import deque
+from typing import Dict, Literal, Optional
+
+import torch
+import torch.nn as nn
+
+
+def gradfilter_ma(
+    m: nn.Module,
+    grads: Optional[Dict[str, deque]] = None,
+    window_size: int = 100,
+    lamb: float = 5.0,
+    filter_type: Literal["mean", "sum"] = "mean",
+    warmup: bool = True,
+    trigger: bool = False,  # For ablation study.
+) -> Dict[str, deque]:
+    if grads is None:
+        grads = {
+            n: deque(maxlen=window_size)
+            for n, p in m.named_parameters()
+            if p.requires_grad and p.grad is not None
+        }
+
+    for n, p in m.named_parameters():
+        if p.requires_grad and p.grad is not None:
+            grads[n].append(p.grad.data.detach())  # .cpu())
+
+            # Modify the gradients.
+            if not warmup or len(grads[n]) == window_size and not trigger:
+                if filter_type == "mean":
+                    avg = sum(grads[n]) / len(grads[n])
+                elif filter_type == "sum":
+                    avg = sum(grads[n])
+                else:
+                    raise ValueError(f"Unrecognized filter_type {filter_type}")
+                p.grad.data = p.grad.data + avg * lamb
+
+    return grads
+
+
+def gradfilter_ema(
+    m: nn.Module,
+    grads: Optional[Dict[str, torch.Tensor]] = None,
+    alpha: float = 0.98,
+    lamb: float = 2.0,
+) -> Dict[str, torch.Tensor]:
+    if grads is None:
+        grads = {
+            n: p.grad.data.detach()
+            for n, p in m.named_parameters()
+            if p.requires_grad and p.grad is not None
+        }
+
+    for n, p in m.named_parameters():
+        if p.requires_grad and p.grad is not None:
+            grads[n] = grads[n] * alpha + p.grad.data.detach() * (1 - alpha)
+            p.grad.data = p.grad.data + grads[n] * lamb
+
+    return grads
--- a/src/axolotl/integrations/liger/LICENSE
+++ b/src/axolotl/integrations/liger/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -0,0 +1,116 @@
+# Copyright 2024 Axolotl AI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Module for the Plugin for LIGER integraton with Axolotl.
+
+Liger Kernel is the collection of Triton-native kernels for LLM Training.
+It is designed to be performant, correct, and light-weight.
+"""
+import inspect
+import logging
+import sys
+
+from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
+from liger_kernel.transformers.functional import liger_cross_entropy
+from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
+from liger_kernel.transformers.rms_norm import LigerRMSNorm
+from liger_kernel.transformers.rope import liger_rotary_pos_emb
+from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
+
+from axolotl.integrations.base import BasePlugin
+
+from ...utils.distributed import zero_only
+from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
+
+LOG = logging.getLogger("axolotl.integrations.liger")
+
+
+class LigerPlugin(BasePlugin):
+    """
+    Plugin for LIGER integraton with Axolotl.
+    """
+
+    def get_input_args(self):
+        return "axolotl.integrations.liger.LigerArgs"
+
+    def pre_model_load(self, cfg):
+        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
+            liger_fn_sig = inspect.signature(apply_liger_fn)
+            kwargs = {}
+            if "rope" in liger_fn_sig.parameters:
+                kwargs["rope"] = cfg.liger_rope
+            if "cross_entropy" in liger_fn_sig.parameters:
+                kwargs["cross_entropy"] = cfg.liger_cross_entropy
+            if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
+                kwargs[
+                    "fused_linear_cross_entropy"
+                ] = cfg.liger_fused_linear_cross_entropy
+            if "rms_norm" in liger_fn_sig.parameters:
+                kwargs["rms_norm"] = cfg.liger_rms_norm
+            if "layer_norm" in liger_fn_sig.parameters:
+                kwargs["layer_norm"] = cfg.liger_layer_norm
+            if "geglu" in liger_fn_sig.parameters:
+                kwargs["geglu"] = cfg.liger_glu_activation
+            elif "swiglu" in liger_fn_sig.parameters:
+                kwargs["swiglu"] = cfg.liger_glu_activation
+            with zero_only():
+                LOG.info(
+                    f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
+                )
+            apply_liger_fn(**kwargs)
+        elif cfg.model_config_type == "jamba":
+            from transformers.models.jamba import modeling_jamba
+
+            from .models.jamba import lce_forward as jamba_lce_forward
+
+            if cfg.liger_rope:
+                modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
+            if cfg.liger_rms_norm:
+                modeling_jamba.JambaRMSNorm = LigerRMSNorm
+            if cfg.liger_glu_activation:
+                modeling_jamba.JambaMLP = LigerSwiGLUMLP
+            if cfg.liger_cross_entropy:
+                from transformers.loss.loss_utils import nn
+
+                nn.functional.cross_entropy = liger_cross_entropy
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
+        elif cfg.model_config_type == "deepseek_v2":
+            from accelerate import init_empty_weights
+            from transformers import AutoModelForCausalLM
+
+            with init_empty_weights():
+                model = AutoModelForCausalLM.from_pretrained(
+                    cfg.base_model, trust_remote_code=cfg.trust_remote_code or False
+                )
+                modeling_mod = sys.modules[model.__class__.__module__]
+
+            from .models.deepseekv2 import lce_forward as deepseekv2_lce_forward
+
+            if cfg.liger_rope:
+                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
+                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
+                logging.warning("Fused liger_rope is not supported for DeepseekV2.")
+            if cfg.liger_rms_norm:
+                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
+            if cfg.liger_glu_activation:
+                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
+            if cfg.liger_cross_entropy:
+                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
+                # nn.CrossEntropyLoss in the forward method.
+                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
+            if cfg.liger_fused_linear_cross_entropy:
+                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
--- a/src/axolotl/integrations/liger/args.py
+++ b/src/axolotl/integrations/liger/args.py
@@ -0,0 +1,53 @@
+# Copyright 2024 Axolotl AI. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Module for handling LIGER input arguments.
+"""
+import logging
+from typing import Optional
+
+from pydantic import BaseModel, model_validator
+
+LOG = logging.getLogger("axolotl.integrations.liger.args")
+
+
+class LigerArgs(BaseModel):
+    """
+    Input args for LIGER.
+    """
+
+    liger_rope: Optional[bool] = None
+    liger_rms_norm: Optional[bool] = None
+    liger_layer_norm: Optional[bool] = None
+    liger_swiglu: Optional[bool] = None
+    liger_glu_activation: Optional[bool] = None
+    liger_cross_entropy: Optional[bool] = None
+    liger_fused_linear_cross_entropy: Optional[bool] = None
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_deprecated_swiglu(cls, data):
+        if data.get("liger_swiglu") is not None:
+            if data.get("liger_glu_activation") is not None:
+                raise ValueError(
+                    "You cannot have both `liger_swiglu` and `liger_glu_activation` set."
+                )
+
+            LOG.warning(
+                "The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
+                "Please use 'liger_glu_activation' instead."
+            )
+            data["liger_glu_activation"] = data.pop("liger_swiglu")
+        return data
--- a/src/axolotl/integrations/liger/models/deepseekv2.py
+++ b/src/axolotl/integrations/liger/models/deepseekv2.py
@@ -0,0 +1,127 @@
+"""
+DeepseekV2 model with LigerFusedLinearCrossEntropyLoss
+"""
+# pylint: disable=duplicate-code
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+from liger_kernel.transformers.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyLoss,
+)
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+
+# @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
+# @replace_return_docstrings(
+#    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+# )
+def lce_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, CausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM
+
+    >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if self.training:
+        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+
+        # flatten tokens
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+        shift_labels = shift_labels.view(-1)
+
+        lce = LigerFusedLinearCrossEntropyLoss()
+        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+    else:
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return CausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+    )
--- a/src/axolotl/integrations/liger/models/jamba.py
+++ b/src/axolotl/integrations/liger/models/jamba.py
@@ -0,0 +1,173 @@
+"""
+Jamba model with LigerFusedLinearCrossEntropyLoss
+"""
+# pylint: disable=duplicate-code
+
+from typing import Optional, Tuple, Union
+
+import torch
+from liger_kernel.transformers.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyLoss,
+)
+from torch.nn import CrossEntropyLoss
+from transformers.modeling_outputs import MoeCausalLMOutputWithPast
+from transformers.models.jamba.modeling_jamba import (
+    _CONFIG_FOR_DOC,
+    JAMBA_INPUTS_DOCSTRING,
+    HybridMambaAttentionDynamicCache,
+    load_balancing_loss_func,
+)
+from transformers.utils import (
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+
+
+@add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
+@replace_return_docstrings(
+    output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
+)
+def lce_forward(
+    self,
+    input_ids: torch.LongTensor = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    output_router_logits: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    num_logits_to_keep: Optional[Union[int, None]] = None,
+) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+    r"""
+    Args:
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        num_logits_to_keep (`int` or `None`, *optional*):
+            Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all
+            `input_ids`. Only last token logits are needed for generation, and calculating them only for that token
+            can save memory, which becomes pretty significant for long sequences.
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, JambaForCausalLM
+
+    >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
+    >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")
+
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+
+    output_attentions = (
+        output_attentions
+        if output_attentions is not None
+        else self.config.output_attentions
+    )
+    output_router_logits = (
+        output_router_logits
+        if output_router_logits is not None
+        else self.config.output_router_logits
+    )
+
+    output_hidden_states = (
+        output_hidden_states
+        if output_hidden_states is not None
+        else self.config.output_hidden_states
+    )
+    return_dict = (
+        return_dict if return_dict is not None else self.config.use_return_dict
+    )
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        cache_position=cache_position,
+        return_dict=return_dict,
+    )
+
+    hidden_states = outputs[0]
+
+    loss = None
+    logits = None
+
+    if self.training:
+        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+
+        # flatten tokens
+        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
+        shift_labels = shift_labels.view(-1)
+
+        lce = LigerFusedLinearCrossEntropyLoss()
+        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
+    else:
+        if num_logits_to_keep is None:
+            logits = self.lm_head(hidden_states)
+        else:
+            logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :])
+        logits = logits.float()
+
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+    aux_loss = None
+    if output_router_logits:
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits if return_dict else outputs[-1],
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(
+                loss.device
+            )  # make sure to reside in the same device
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        if output_router_logits:
+            output = (aux_loss,) + output
+        return (loss,) + output if loss is not None else output
+
+    return MoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        router_logits=outputs.router_logits,
+    )
--- a/src/axolotl/integrations/lm_eval/README.md
+++ b/src/axolotl/integrations/lm_eval/README.md
@@ -0,0 +1,13 @@
+# LM Eval Harness
+
+### Usage
+
+```yaml
+plugins:
+  - axolotl.integrations.lm_eval.LMEvalPlugin
+
+lm_eval_tasks:
+  - gsm8k
+  - hellaswag
+  - arc_easy
+```
--- a/src/axolotl/integrations/lm_eval/init.py
+++ b/src/axolotl/integrations/lm_eval/init.py
@@ -0,0 +1,42 @@
+"""
+Module for the Plugin for LM Eval Harness
+"""
+import subprocess  # nosec
+from datetime import datetime
+
+from axolotl.integrations.base import BasePlugin
+
+from .args import LMEvalArgs  # pylint: disable=unused-import. # noqa: F401
+
+
+class LMEvalPlugin(BasePlugin):
+    """
+    Plugin for LM Evaluation Harness integraton with Axolotl.
+    """
+
+    def get_input_args(self):
+        return "axolotl.integrations.lm_eval.LMEvalArgs"
+
+    def post_train_unload(self, cfg):
+        tasks = ",".join(cfg.lm_eval_tasks)
+        fa2 = ",attn_implementation=flash_attention_2" if cfg.flash_attention else ""
+        dtype = ",dtype=bfloat16" if cfg.bf16 else ",dtype=float16"
+        output_path = cfg.output_dir
+        output_path += "" if cfg.output_dir.endswith("/") else "/"
+        output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        subprocess.run(  # nosec
+            [
+                "lm_eval",
+                "--model",
+                "hf",
+                "--model_args",
+                f"pretrained={cfg.output_dir}{fa2}{dtype}",
+                "--tasks",
+                tasks,
+                "--batch_size",
+                str(cfg.lm_eval_batch_size),
+                "--output_path",
+                output_path,
+            ],
+            check=True,
+        )
--- a/src/axolotl/integrations/lm_eval/args.py
+++ b/src/axolotl/integrations/lm_eval/args.py
@@ -0,0 +1,15 @@
+"""
+Module for handling lm eval harness input arguments.
+"""
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class LMEvalArgs(BaseModel):
+    """
+    Input args for lm eval harness
+    """
+
+    lm_eval_tasks: List[str] = []
+    lm_eval_batch_size: Optional[int] = 8
--- a/src/axolotl/integrations/sageattention/init.py
+++ b/src/axolotl/integrations/sageattention/init.py
--- a/src/axolotl/integrations/sageattention/lib/core.py
+++ b/src/axolotl/integrations/sageattention/lib/core.py
@@ -0,0 +1,361 @@
+"""
+Copyright (c) 2024 by SageAttention team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from typing import Any, Optional
+
+import torch
+from torch.autograd import Function
+
+from .triton.attn_qk_int8_per_block_causal_varlen import (
+    backward as sageattn_varlen_backward,
+)
+from .triton.attn_qk_int8_per_block_causal_varlen import forward as attn_true_varlen
+from .triton.quant_per_block_varlen import (
+    per_block_int8 as per_block_int8_varlen_triton,
+)
+
+
+def get_cuda_arch_versions():
+    cuda_archs = []
+    for i in range(torch.cuda.device_count()):
+        major, minor = torch.cuda.get_device_capability(i)
+        cuda_archs.append(f"sm{major}{minor}")
+    return cuda_archs
+
+
+def sageattn_varlen(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor,
+    cu_seqlens_k: torch.Tensor,
+    max_seqlen_q: int,
+    max_seqlen_k: int,
+    sm_scale: Optional[float] = None,
+    smooth_k: bool = True,
+    **kwargs: Any,
+) -> torch.Tensor:
+    """
+
+    Parameters
+    ----------
+    q : torch.Tensor
+        The query tensor, shape: ``[cu_seqlens_q[-1], num_qo_heads, head_dim]``.
+
+    k : torch.Tensor
+        The key tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
+
+    v : torch.Tensor
+        The value tensor, shape: ``[cu_seqlens_k[-1], num_kv_heads, head_dim]``.
+
+    cu_seqlens_q : torch.Tensor
+        The cumulative sequence lengths for the query sequences in the batch, used to index into `q`.
+        Shape: ``[batch_size + 1]``, where each entry represents the cumulative length of sequences up to that batch index.
+
+    cu_seqlens_k : torch.Tensor
+        The cumulative sequence lengths for the key and value sequences in the batch, used to index into `k` and `v`.
+        Shape: ``[batch_size + 1]``, where each entry represents the cumulative length of sequences up to that batch index.
+
+    max_seqlen_q : int
+        The maximum sequence length for the query tensor in the batch.
+
+    max_seqlen_k : int
+        The maximum sequence length for the key and value tensors in the batch.
+
+    is_causal : bool
+        Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len for each sequence.
+        Default: False.
+
+    sm_scale : Optional[float]
+        The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
+
+    smooth_k : bool
+        Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
+        Default: True.
+
+    Returns
+    -------
+    torch.Tensor
+        The output tensor, shape: ``[cu_seqlens_q[-1], num_qo_heads, head_dim]``.
+
+    Note
+    ----
+    - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
+    - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16``, ``torch.bfloat16`` or ``torch.float32``.
+    - The tensors `cu_seqlens_q` and `cu_seqlens_k` must have the dtype ``torch.int32`` or ``torch.int64``.
+    - All tensors must be on the same cuda device.
+    - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
+    """
+
+    dtype = q.dtype
+    assert q.is_cuda, "Input tensors must be on cuda."
+    assert dtype in [
+        torch.float16,
+        torch.bfloat16,
+    ], "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
+    assert q.device == k.device == v.device, "All tensors must be on the same device."
+    assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
+
+    head_dim = q.size(-1)
+    assert head_dim in [64, 128], "varlen only support head_dim [64, 128]."
+
+    assert (
+        q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1
+    ), "Last dim of qkv must be contiguous."
+    assert (
+        cu_seqlens_q.is_contiguous() and cu_seqlens_k.is_contiguous()
+    ), "cu_seqlens_q and cu_seqlens_k must be contiguous."
+
+    if dtype == torch.bfloat16 or dtype == torch.float32:
+        v = v.to(torch.float16)
+
+    if smooth_k:
+        km = k.mean(
+            dim=0, keepdim=True
+        )  # ! km is calculated on the all the batches. Calculate over each individual sequence requires dedicated kernel.
+        k -= km
+
+    (
+        q_int8,
+        q_scale,
+        k_int8,
+        k_scale,
+        cu_seqlens_q_scale,
+        cu_seqlens_k_scale,
+    ) = per_block_int8_varlen_triton(
+        q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=sm_scale
+    )
+
+    o = attn_true_varlen(
+        q_int8,
+        k_int8,
+        v,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        q_scale,
+        k_scale,
+        cu_seqlens_q_scale,
+        cu_seqlens_k_scale,
+        output_dtype=dtype,
+    )
+
+    return o
+
+
+class SageAttentionFunction(Function):
+    @staticmethod
+    def forward(
+        ctx,
+        query,
+        key,
+        value,
+        attn_mask=None,
+        dropout_p=0.0,
+        is_causal=False,
+        scale=None,
+    ):
+        """
+        query: Tensor of shape [batch_size, num_heads, seq_len_q, head_dim]
+        key: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
+        value: Tensor of shape [batch_size, num_heads, seq_len_k, head_dim]
+        attn_mask: Optional[Tensor], mask tensor
+        dropout_p: float, dropout probability
+        is_causal: bool, whether to apply causal masking
+        scale: Optional[float], scaling factor for attention scores
+        """
+        # Ensure inputs are contiguous
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+
+        # Handle default scale
+        if scale is None:
+            scale = 1.0 / (query.size(-1) ** 0.5)
+
+        # Save parameters needed for backward
+        ctx.scale = scale
+        ctx.is_causal = is_causal
+        ctx.dropout_p = dropout_p
+        ctx.attn_mask = attn_mask
+
+        # Prepare cumulative sequence lengths and max sequence lengths
+        # Assuming batch sizes are consistent across query, key, and value
+        batch_size, num_heads, seq_len_q, head_dim = query.shape
+        seq_len_k = key.shape[2]
+
+        # Flatten batch and head dimensions
+        q = query.view(
+            -1, seq_len_q, head_dim
+        )  # [batch_size * num_heads, seq_len_q, head_dim]
+        k = key.view(-1, seq_len_k, head_dim)
+        v = value.view(-1, seq_len_k, head_dim)
+
+        # Create cumulative sequence lengths
+        cu_seqlens_q = torch.arange(
+            0,
+            (batch_size * num_heads + 1) * seq_len_q,
+            seq_len_q,
+            dtype=torch.int32,
+            device=query.device,
+        )
+        cu_seqlens_k = torch.arange(
+            0,
+            (batch_size * num_heads + 1) * seq_len_k,
+            seq_len_k,
+            dtype=torch.int32,
+            device=key.device,
+        )
+        max_seqlen_q = seq_len_q
+        max_seqlen_k = seq_len_k
+
+        # Call your custom per-block int8 quantization function
+        (
+            q_int8,
+            q_scale,
+            k_int8,
+            k_scale,
+            cu_seqlens_q_scale,
+            cu_seqlens_k_scale,
+        ) = per_block_int8_varlen_triton(
+            q, k, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, sm_scale=scale
+        )
+
+        # Call your custom attention function
+        if is_causal:
+            output = attn_true_varlen(
+                q_int8,
+                k_int8,
+                v,
+                cu_seqlens_q,
+                cu_seqlens_k,
+                max_seqlen_q,
+                q_scale,
+                k_scale,
+                cu_seqlens_q_scale,
+                cu_seqlens_k_scale,
+                output_dtype=query.dtype,
+            )
+        else:
+            raise NotImplementedError("Non-causal attention is not implemented yet.")
+
+        # Reshape output to match the expected shape
+        output = output.view(batch_size, num_heads, seq_len_q, head_dim)
+
+        # Save tensors for backward
+        ctx.save_for_backward(
+            query,
+            key,
+            value,
+            q_int8,
+            k_int8,
+            q_scale,
+            k_scale,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            cu_seqlens_q_scale,
+            cu_seqlens_k_scale,
+            output,
+        )
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        (
+            query,
+            key,
+            value,
+            q_int8,
+            k_int8,
+            q_scale,
+            k_scale,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            cu_seqlens_q_scale,
+            cu_seqlens_k_scale,
+            output,
+        ) = ctx.saved_tensors
+
+        scale = ctx.scale
+        is_causal = ctx.is_causal
+        dropout_p = ctx.dropout_p
+        attn_mask = ctx.attn_mask
+
+        # Flatten batch and head dimensions
+        batch_size, num_heads, seq_len_q, head_dim = query.shape
+        seq_len_k = key.shape[2]
+        grad_output = grad_output.contiguous()
+        do = grad_output.view(-1, seq_len_q, head_dim)
+
+        # Compute gradients w.r.t. q, k, v
+        dq, dk, dv = sageattn_varlen_backward(
+            do,
+            query.view(-1, seq_len_q, head_dim),
+            key.view(-1, seq_len_k, head_dim),
+            value.view(-1, seq_len_k, head_dim),
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seq_len_q,
+            seq_len_k,
+            q_int8,
+            k_int8,
+            q_scale,
+            k_scale,
+            cu_seqlens_q_scale,
+            cu_seqlens_k_scale,
+            scale,
+            is_causal,
+        )
+
+        # Reshape gradients to match the input shapes
+        dq = dq.view(batch_size, num_heads, seq_len_q, head_dim)
+        dk = dk.view(batch_size, num_heads, seq_len_k, head_dim)
+        dv = dv.view(batch_size, num_heads, seq_len_k, head_dim)
+
+        # Handle optional arguments
+        d_attn_mask = None  # Assuming attn_mask does not require gradients
+        d_dropout_p = (
+            None  # Dropout probability is a hyperparameter, typically not optimized
+        )
+        d_is_causal = None  # Not differentiable
+        d_scale = None  # If scale is a tensor and requires grad, compute its gradient
+
+        return dq, dk, dv, d_attn_mask, d_dropout_p, d_is_causal, d_scale
+
+
+def scaled_dot_product_attention(
+    query,
+    key,
+    value,
+    attn_mask=None,
+    dropout_p=0.0,
+    is_causal=False,
+    scale=None,
+):
+    """
+    Custom scaled dot product attention using SageAttentionFunction.
+    """
+    return SageAttentionFunction.apply(
+        query, key, value, attn_mask, dropout_p, is_causal, scale
+    )
+
+
+def monkeypatch_sdp_w_sage_attention():
+    """
+    Replace torch.nn.functional.scaled_dot_product_attention with custom scaled dot product attention using SageAttentionFunction.
+    """
+    torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
--- a/Show More
+++ b/Show More