Simplify creating parameters

Stop transformers from using all memory
Simplify conversion + more debug
2024-03-18 12:32:59 +00:00 · 2024-03-18 11:47:47 +00:00 · 2024-03-17 20:21:46 +00:00 · 2024-03-17 19:52:56 +01:00 · 2024-03-17 19:51:31 +01:00 · 2024-03-17 19:48:52 +01:00
101 changed files with 3425 additions and 3418 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,22 +16,17 @@ jobs:
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.11"
            pytorch: 2.1.2
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-          - cuda: "121"
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,31 +0,0 @@
-name: Publish Docs
-on:
-  push:
-    branches:
-      - main
-
-permissions:
-    contents: write
-    pages: write
-
-jobs:
-    build-deploy:
-        runs-on: ubuntu-latest
-        steps:
-        - name: Check out repository
-          uses: actions/checkout@v4
-        - name: Set up Quarto
-          uses: quarto-dev/quarto-actions/setup@v2
-        - name: Setup Python
-          uses: actions/setup-python@v3
-          with:
-            python-version: '3.10'
-        - name: install dependencies
-          run: |
-            python3 -m pip install jupyter
-        - name: Publish to GitHub Pages (and render)
-          uses: quarto-dev/quarto-actions/publish@v2
-          with:
-            target: gh-pages
-          env:
-            GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -28,7 +28,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.2.1
+            pytorch: 2.1.2
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -63,7 +63,7 @@ jobs:
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

-  build-axolotl-cloud:
+  build-axolotl-runpod:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    # this job needs to be run on self-hosted GPU runners...
@@ -84,7 +84,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.11"
-            pytorch: 2.2.1
+            pytorch: 2.1.2
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -113,5 +113,7 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+             winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
+             ${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -1,118 +0,0 @@
-name: docker-nightlies
-
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
-
-jobs:
-  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.2
-            axolotl_extras:
-            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
-            is_latest: true
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.10"
-            pytorch: 2.1.2
-            axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.1
-            axolotl_extras:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: winglian/axolotl
-          tags: |
-            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
-      - name: Build and export to Docker
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-          file: ./docker/Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
-  build-axolotl-cloud:
-    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      matrix:
-        include:
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.2
-            axolotl_extras:
-            is_latest: true
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.10"
-            pytorch: 2.1.2
-            axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.1
-            axolotl_extras:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: winglian/axolotl-cloud
-          tags: |
-            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -25,7 +25,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install wheel packaging
+          pip3 install wheel
          pip3 install -e .
          pip3 install -r requirements-tests.txt

--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -34,7 +34,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
-    timeout-minutes: 20
+    timeout-minutes: 10

    steps:
      - name: Check out repository code
@@ -48,8 +48,6 @@ jobs:

      - name: Install dependencies
        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging
          pip3 install -U -e .
          pip3 install -r requirements-tests.txt

@@ -79,11 +77,6 @@ jobs:
            python_version: "3.10"
            pytorch: 2.1.2
            num_gpus: 1
-          - cuda: 121
-            cuda_version: 12.1.0
-            python_version: "3.11"
-            pytorch: 2.2.1
-            num_gpus: 1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
 configs
 last_run_prepared/
 .vscode
-_site/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -173,5 +172,3 @@ wandb
 lora-out/*
 qlora-out/*
 mlruns/*
-
-/.quarto/
--- a/README.md
+++ b/README.md
@@ -13,9 +13,6 @@ Features:
 - Log results and optionally checkpoints to wandb or mlflow
 - And more!

-<a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
-  <img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWEx
LjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
-</a>

 <table>
 <tr>
@@ -31,19 +28,18 @@ Features:
  - [Cloud GPU](#cloud-gpu) - Latitude.sh, JarvisLabs, RunPod
  - [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
  - [Windows](#windows)
-  - [Mac](#mac)
-  - [Google Colab](#google-colab)
  - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
 - [Dataset](#dataset)
+  - [How to Add Custom Prompts](#how-to-add-custom-prompts)
+  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
  - [Train](#train)
  - [Inference](#inference-playground)
  - [Merge LORA to Base](#merge-lora-to-base)
  - [Special Tokens](#special-tokens)
-  - [All Config Options](#all-config-options)
 - Advanced Topics
-  - [Multipack](./docs/multipack.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
-  - [RLHF & DPO](./docs/rlhf.qmd)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
+  - [Multipack](./docs/multipack.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
+  - [RLHF & DPO](./docs/rlhf.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
 - [Common Errors](#common-errors-)
  - [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
 - [Debugging Axolotl](#debugging-axolotl)
@@ -103,14 +99,24 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo

 **Requirements**: Python >=3.10 and Pytorch >=2.1.1.

+### For developers
 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl

-pip3 install packaging ninja
+pip3 install packaging
+```
+
+General case:
+```
 pip3 install -e '.[flash-attn,deepspeed]'
 ```

+Mac: see https://github.com/OpenAccess-AI-Collective/axolotl/blob/13199f678b9aab39e92961323bdbce3234ee4b2b/docs/mac.md
+```
+pip3 install -e '.'
+```
+
 ### Usage
 ```bash
 # preprocess datasets - optional but recommended
@@ -149,7 +155,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc
  ```

 >[!Tip]
-> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
+> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker).

  <details>

@@ -221,51 +227,31 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
  python get-pip.py
  ```

-  3. Install Pytorch https://pytorch.org/get-started/locally/
-
-  4. Follow instructions on quickstart.
-
-  5. Run
+  3. Install torch
  ```bash
+  pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
+  ```
+
+  4. Axolotl
+  ```bash
+  git clone https://github.com/OpenAccess-AI-Collective/axolotl
+  cd axolotl
+
+  pip3 install packaging
+  pip3 install -e '.[flash-attn,deepspeed]'
  pip3 install protobuf==3.20.3
  pip3 install -U --ignore-installed requests Pillow psutil scipy
  ```

-  6. Set path
+  5. Set path
  ```bash
  export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
  ```
  </details>

-##### GCP
-
-<details>
-
-<summary>Click to Expand</summary>
-
-Use a Deeplearning linux OS with cuda and pytorch installed. Then follow instructions on quickstart.
-
-Make sure to run the below to uninstall xla.
-```bash
-pip uninstall -y torch_xla[tpu]
-```
-
-</details>
-
 #### Windows
 Please use WSL or Docker!

-#### Mac
-
-Use the below instead of the install method in QuickStart.
-```
-pip3 install -e '.'
-```
-More info: [mac.md](/docs/mac.qmd)
-
-#### Google Colab
-
-Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb).

 #### Launching on public clouds via SkyPilot
 To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
@@ -292,9 +278,186 @@ HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKE

 ### Dataset

-Axolotl supports a variety of dataset formats.  It is recommended to use a JSONL.  The schema of the JSONL depends upon the task and the prompt template you wish to use.  Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.
+Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
+Have dataset(s) in one of the following formats (JSONL recommended):

-See [these docs](https://openaccess-ai-collective.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
+#### Pretraining
+
+- `completion`: raw corpus
+  ```json
+  {"text": "..."}
+  ```
+
+Note: Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
+
+```yaml
+pretraining_dataset: # hf path only
+```
+
+#### Supervised finetuning
+
+##### Instruction
+
+- `alpaca`: instruction; input(optional)
+  ```json
+  {"instruction": "...", "input": "...", "output": "..."}
+  ```
+
+<details>
+
+<summary>See other formats</summary>
+
+- `jeopardy`: question and answer
+  ```json
+  {"question": "...", "category": "...", "answer": "..."}
+  ```
+- `oasst`: instruction
+  ```json
+  {"INSTRUCTION": "...", "RESPONSE": "..."}
+  ```
+- `gpteacher`: instruction; input(optional)
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `reflection`: instruction with reflect; input(optional)
+  ```json
+  {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
+  ```
+- `explainchoice`: question, choices, (solution OR explanation)
+  ```json
+  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+  ```
+- `concisechoice`: question, choices, (solution OR explanation)
+  ```json
+  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+  ```
+- `summarizetldr`: article and summary
+  ```json
+  {"article": "...", "summary": "..."}
+  ```
+- `alpaca_chat`: basic instruct for alpaca chat
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `alpaca_chat.load_qa`: question and answer for alpaca chat
+  ```json
+  {"question": "...", "answer": "..."}
+  ```
+- `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for load_camel_ai
+  ```json
+  {"message_1": "...", "message_2": "..."}
+  ```
+- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
+  ```json
+  {"system_prompt": "...", "question": "...", "response": "..."}
+  ```
+- `context_qa`: in context question answering from an article
+  ```json
+  {"article": "...", "question": "...", "answer": "..."}
+  ```
+- `context_qa.load_v2`: in context question answering (alternate)
+  ```json
+  {"context": "...", "question": "...", "answer": "..."}
+  ```
+- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
+  ```json
+  {"article": "...", "unanswerable_question": "..."}
+  ```
+- `creative_acr.load_answer`: instruction and revision
+  ```json
+  {"instruction": "...", "revision": "..."}
+  ```
+- `creative_acr.load_critique`: critique
+  ```json
+  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
+  ```
+- `creative_acr.load_revise`: critique and revise
+  ```json
+  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
+  ```
+- `metharme`: instruction, adds additional eos tokens
+  ```json
+  {"prompt": "...", "generation": "..."}
+  ```
+
+</details>
+
+##### Template-Free
+
+- `input_output`: template-free prompt construction
+  ```json
+   {"segments": [{"label": true|false, "text": "..."}]}
+  ```
+
+This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction.  See [these docs](docs/input_output.md) for more details.
+
+##### Conversation
+
+- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
+  ```json
+  {"conversations": [{"from": "...", "value": "..."}]}
+  ```
+
+<details>
+
+<summary>See other formats</summary>
+
+- `pygmalion`: pygmalion
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt.load_role`: conversations where `role` is used instead of `from`
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt
+  ```json
+  {"conversations": [{"from": "...", "value": "..."}]}
+  ```
+- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+  ```json
+  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+  ```
+
+</details>
+
+Note: `type: sharegpt` opens a special config `conversation:` that enables conversions to many Conversation types. See dataset section under [all yaml options](#all-yaml-options).
+
+#### How to add custom prompts
+
+For a dataset that is preprocessed for instruction purposes:
+
+```json
+{"input": "...", "output": "..."}
+```
+
+You can use this example in your YAML config:
+
+```yaml
+datasets:
+  - path: repo
+    type:
+      system_prompt: ""
+      field_system: system
+      field_instruction: input
+      field_output: output
+      format: "[INST] {instruction} [/INST]"
+      no_input_format: "[INST] {instruction} [/INST]"
+```
+See full config options under [all yaml options](#all-yaml-options).
+
+#### How to use your custom pretokenized dataset
+
+- Do not pass a `type:`
+- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
+
+```yaml
+- path: ...
+```

 ### Config

@@ -379,9 +542,485 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
    - v_proj
  ```

-#### All Config Options
+<details id="all-yaml-options">

-See [these docs](docs/config.qmd) for all config options.
+<summary>All yaml options (click to expand)</summary>
+
+```yaml
+# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# This can also be a relative path to a model on disk
+base_model: ./llama-7b-hf
+# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+base_model_ignore_patterns:
+# If the base_model repo on hf hub doesn't include configuration .json files,
+# You can set that here, or leave this empty to default to base_model
+base_model_config: ./llama-7b-hf
+# You can specify to choose a specific model revision from huggingface hub
+revision_of_model:
+# Optional tokenizer configuration path in case you want to use a different tokenizer
+# than the one defined in the base model
+tokenizer_config:
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
+model_type: AutoModelForCausalLM
+# Corresponding tokenizer for the model; AutoTokenizer is a good choice
+tokenizer_type: AutoTokenizer
+# Trust remote code for untrusted source
+trust_remote_code:
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast:
+# Whether to use the legacy tokenizer setting, defaults to True
+tokenizer_legacy:
+# Resize the model embeddings when new tokens are added to multiples of 32
+# This is reported to improve training speed on some models
+resize_token_embeddings_to_32x:
+
+# (Internal use only)
+# Used to identify which model family the model is based on
+is_falcon_derived_model:
+is_llama_derived_model:
+is_qwen_derived_model:
+# Please note that if you set this to true, `padding_side` will be set to "left" by default
+is_mistral_derived_model:
+
+# optional overrides to the base model configuration
+overrides_of_model_config:
+  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+  rope_scaling:
+    type: # linear | dynamic
+    factor: # float
+
+# optional overrides to the bnb 4bit quantization configuration
+# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
+bnb_config_kwargs:
+  # These are default values
+  llm_int8_has_fp16_weight: false
+  bnb_4bit_quant_type: nf4
+  bnb_4bit_use_double_quant: true
+
+
+# Whether you are training a 4-bit GPTQ quantized model
+gptq: true
+
+# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+load_in_8bit: true
+# Use bitsandbytes 4 bit
+load_in_4bit:
+
+# Use CUDA bf16
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+# Use CUDA fp16
+fp16: true
+# Use CUDA tf32
+tf32: true # require >=ampere
+
+# No AMP (automatic mixed precision)
+bfloat16: true # require >=ampere
+float16: true
+
+# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
+gpu_memory_limit: 20GiB
+# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
+lora_on_cpu: true
+
+# A list of one or more datasets to finetune the model with
+datasets:
+  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  - path: vicgalle/alpaca-gpt4
+  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+    data_files: # Optional[str] path to source data files
+    shards: # Optional[int] number of shards to split data into
+    name: # Optional[str] name of dataset configuration to load
+    train_on_split: train # Optional[str] name of dataset split to load from
+
+    # Optional[str] fastchat conversation type, only used with type: sharegpt
+    conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+
+  # Custom user instruction prompt
+  - path: repo
+    type:
+      # The below are defaults. only set what's needed if you use a different column name.
+      system_prompt: ""
+      system_format: "{system}"
+      field_system: system
+      field_instruction: instruction
+      field_input: input
+      field_output: output
+
+      # Customizable to be single line or multi-line
+      # Use {instruction}/{input} as key to be replaced
+      # 'format' can include {input}
+      format: |-
+        User: {instruction} {input}
+        Assistant:
+      # 'no_input_format' cannot include {input}
+      no_input_format: "{instruction} "
+
+      # For `completion` datasets only, uses the provided field instead of `text` column
+      field:
+
+# A list of one or more datasets to eval the model with.
+# You can use either test_datasets, or val_set_size, but not both.
+test_datasets:
+  - path: /workspace/data/eval.jsonl
+    ds_type: json
+    # You need to specify a split. For "json" datasets the default split is called "train".
+    split: train
+    type: completion
+    data_files:
+      - /workspace/data/eval.jsonl
+
+# use RL training: 'dpo', 'ipo', 'kto_pair'
+rl:
+
+# Saves the desired chat template to the tokenizer_config.json for easier inferencing
+# Currently supports chatml and inst (mistral/mixtral)
+chat_template: chatml
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
+# Axolotl attempts to save the dataset as an arrow after packing the data together so
+# subsequent training attempts load faster, relative path
+dataset_prepared_path: data/last_run_prepared
+# Push prepared dataset to hub
+push_dataset_to_hub: # repo path
+# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
+# if not set.
+dataset_processes: # defaults to os.cpu_count() if not set
+# Keep dataset in memory while preprocessing
+# Only needed if cached dataset is taking too much storage
+dataset_keep_in_memory:
+# push checkpoints to hub
+hub_model_id: # private repo path to push finetuned model
+# how to push checkpoints to hub
+# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+hub_strategy:
+# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
+# Required to be true when used in combination with `push_dataset_to_hub`
+hf_use_auth_token: # boolean
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
+val_set_size: 0.04
+# Num shards for whole dataset
+dataset_shard_num:
+# Index of shard to use for whole dataset
+dataset_shard_idx:
+
+# The maximum length of an input to train with, this should typically be less than 2048
+# as most models have a token/context limit of 2048
+sequence_len: 2048
+# Pad inputs so each step uses constant sized buffers
+# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+pad_to_sequence_len:
+# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommended to set to 'true'
+sample_packing:
+# Set to 'false' if getting errors during eval with sample_packing on.
+eval_sample_packing:
+# You can set these packing optimizations AFTER starting a training at least once.
+# The trainer will log recommended values for these settings.
+sample_packing_eff_est:
+total_num_tokens:
+
+# Passed through to transformers when loading the model when launched without accelerate
+# Use `sequential` when training w/ model parallelism to limit memory
+device_map:
+# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
+max_memory:
+
+# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
+adapter: lora
+# If you already have a lora model trained that you want to load, put that here.
+# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
+# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
+lora_model_dir:
+
+# LoRA hyperparameters
+# For more details about the following options, see:
+# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+#  - k_proj
+#  - o_proj
+#  - gate_proj
+#  - down_proj
+#  - up_proj
+lora_target_linear: # If true, will target all linear modules
+peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
+
+# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
+# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
+# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+lora_modules_to_save:
+#  - embed_tokens
+#  - lm_head
+
+lora_fan_in_fan_out: false
+
+peft:
+  # Configuration options for loftq initialization for LoRA
+  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
+  loftq_config:
+    loftq_bits:  # typically 4 bits
+
+# ReLoRA configuration
+# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
+relora_steps: # Number of steps per ReLoRA restart
+relora_warmup_steps: # Number of per-restart warmup steps
+relora_anneal_steps: # Number of anneal steps for each relora cycle
+relora_prune_ratio: # threshold for optimizer magnitude when pruning
+relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
+
+# wandb configuration if you're using it
+# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
+wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+wandb_project: # Your wandb project name
+wandb_entity: # A wandb Team name if using a Team
+wandb_watch:
+wandb_name: # Set the name of your wandb run
+wandb_run_id: # Set the ID of your wandb run
+wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
+
+# mlflow configuration if you're using it
+mlflow_tracking_uri: # URI to mlflow
+mlflow_experiment_name: # Your experiment name
+hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry
+
+# Where to save the full-finetuned model to
+output_dir: ./completed-model
+
+# Whether to use torch.compile and which backend to use
+torch_compile:  # bool
+torch_compile_backend:  # Optional[str]
+
+# Training hyperparameters
+
+# If greater than 1, the optimizer step will be deferred and the gradients will be accumulated for the given number of steps.
+gradient_accumulation_steps: 1
+# The number of samples to include in each batch. This is the number of samples sent to each GPU.
+micro_batch_size: 2
+eval_batch_size:
+num_epochs: 4
+warmup_steps: 100  # cannot use with warmup_ratio
+warmup_ratio: 0.05  # cannot use with warmup_steps
+learning_rate: 0.00003
+lr_quadratic_warmup:
+logging_steps:
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
+save_strategy: # Set to `no` to skip checkpoint saves
+save_steps: # Leave empty to save at each epoch
+saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
+save_total_limit: # Checkpoints saved at a time
+# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
+# if both are set, num_epochs will not be guaranteed.
+# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+max_steps:
+
+eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
+eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
+eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf"]
+
+loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
+loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
+
+# Save model as safetensors (require safetensors package)
+save_safetensors:
+
+# Whether to mask out or include the human's prompt from the training labels
+train_on_inputs: false
+# Group similarly sized data to minimize padding.
+# May be slower to start, as it must download and sort the entire dataset.
+# Note that training loss may have an oscillating pattern with this enabled.
+group_by_length: false
+
+# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+gradient_checkpointing: false
+# additional kwargs to pass to the trainer for gradient checkpointing
+# gradient_checkpointing_kwargs:
+#   use_reentrant: false
+
+# Stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+
+# Specify a scheduler and kwargs to use with the optimizer
+lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+lr_scheduler_kwargs:
+cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
+cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
+
+# For one_cycle optim
+lr_div_factor: # Learning rate div factor
+
+# Specify optimizer
+# Valid values are driven by the Transformers OptimizerNames class, see:
+# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
+#
+# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
+# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
+# in the examples/ for your model and fine-tuning use case.
+#
+# Valid values for 'optimizer' include:
+# - adamw_hf
+# - adamw_torch
+# - adamw_torch_fused
+# - adamw_torch_xla
+# - adamw_apex_fused
+# - adafactor
+# - adamw_anyprecision
+# - sgd
+# - adagrad
+# - adamw_bnb_8bit
+# - lion_8bit
+# - lion_32bit
+# - paged_adamw_32bit
+# - paged_adamw_8bit
+# - paged_lion_32bit
+# - paged_lion_8bit
+optimizer:
+# Specify weight decay
+weight_decay:
+# adamw hyperparams
+adam_beta1:
+adam_beta2:
+adam_epsilon:
+# Gradient clipping max norm
+max_grad_norm:
+
+# Augmentation techniques
+# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
+# currently only supported on Llama and Mistral
+neftune_noise_alpha:
+
+# Whether to use BetterTransformer (via Optimum)
+flash_optimum:
+# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+xformers_attention:
+# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+flash_attention:
+flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
+flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
+flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# Whether to use scaled-dot-product attention
+# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+sdp_attention:
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+s2_attention:
+# Resume from a specific checkpoint dir
+resume_from_checkpoint:
+# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# Be careful with this being turned on between different models.
+auto_resume_from_checkpoints: false
+
+# Don't mess with this, it's here for accelerate and torchrun
+local_rank:
+
+# Add or change special tokens.
+# If you add tokens here, you don't need to add them to the `tokens` list.
+special_tokens:
+  # bos_token: "<s>"
+  # eos_token: "</s>"
+  # unk_token: "<unk>"
+
+# Add extra tokens.
+tokens:
+
+# FSDP
+fsdp:
+fsdp_config:
+
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
+deepspeed:
+
+# Advanced DDP Arguments
+ddp_timeout:
+ddp_bucket_cap_mb:
+ddp_broadcast_buffers:
+
+# Path to torch distx for optim 'adamw_anyprecision'
+torchdistx_path:
+
+# Set to an HF dataset (for type: 'completion') to stream it instead of pre-tokenizing
+pretraining_dataset:
+
+# Debug mode
+debug:
+
+# Seed
+seed:
+
+# Allow overwriting the yml config from the CLI
+strict:
+```
+
+</details>
+
+<details>
+<summary> Understanding of batch size and gradient accumulation steps </summary>
+<br/>
+Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
+
+This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
+
+1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
+
+2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
+
+**Example 1:**
+Micro batch size: 3
+Gradient accumulation steps: 2
+Number of GPUs: 3
+Total batch size = 3 * 2 * 3 = 18
+
+```
+| GPU 1          | GPU 2          | GPU 3          |
+|----------------|----------------|----------------|
+| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
+| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
+|----------------|----------------|----------------|
+| → (accumulate) | → (accumulate) | → (accumulate) |
+|----------------|----------------|----------------|
+| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
+| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
+|----------------|----------------|----------------|
+| → (apply)      | → (apply)      | → (apply)      |
+
+Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
+
+Weight update for w1:
+w1_new = w1_old - learning rate × (Total gradient for w1 / 18)
+```
+
+**Example 2:**
+Micro batch size: 2
+Gradient accumulation steps: 1
+Number of GPUs: 3
+Total batch size = 2 * 1 * 3 = 6
+
+```
+| GPU 1     | GPU 2     | GPU 3     |
+|-----------|-----------|-----------|
+| S1, S2    | S3, S4    | S5, S6    |
+| e1, e2    | e3, e4    | e5, e6    |
+|-----------|-----------|-----------|
+| → (apply) | → (apply) | → (apply) |
+
+Accumulated gradient for the weight w1 (considering all GPUs):
+Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
+
+Weight update for w1:
+w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
+```
+
+</details>

 ### Train

@@ -443,7 +1082,7 @@ fsdp_config:

 ##### FSDP + QLoRA

-Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
+Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information.

 ##### Weights & Biases Logging

@@ -522,7 +1161,7 @@ although this will be very slow, and using the config options above are recommen

 ## Common Errors 🧰

-See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd).
+See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md).

 > If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:

@@ -556,7 +1195,7 @@ It's safe to ignore it.

 > NCCL Timeouts during training

-See the [NCCL](docs/nccl.qmd) guide.
+See the [NCCL](docs/nccl.md) guide.


 ### Tokenization Mismatch b/w Inference & Training
@@ -574,7 +1213,7 @@ Having misalignment between your prompts during training and inference can cause

 ## Debugging Axolotl

-See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
+See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.

 ## Need help? 🙋

@@ -612,8 +1251,14 @@ Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective

 PRs are **greatly welcome**!

-Please run the quickstart instructions followed by the below to setup env:
+Please run the commands below to set up the environment:
 ```bash
+git clone https://github.com/OpenAccess-AI-Collective/axolotl
+cd axolotl
+
+pip3 install packaging
+pip3 install -e '.[flash-attn,deepspeed]'
+
 pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,51 +0,0 @@
-project:
-  type: website
-
-website:
-  title: "Axolotl"
-  description: "Fine-tuning"
-  favicon: favicon.jpg
-  navbar:
-    title: Axolotl
-    background: dark
-    pinned: false
-    collapse: false
-    tools:
-    - icon: twitter
-      href: https://twitter.com/axolotl_ai
-    - icon: github
-      href: https://github.com/OpenAccess-AI-Collective/axolotl/
-    - icon: discord
-      href: https://discord.gg/7m9sfhzaf3
-
-  sidebar:
-      pinned: true
-      collapse-level: 2
-      style: docked
-      contents:
-        - text: Home
-          href: index.qmd
-        - section: "How-To Guides"
-          contents:
-          # TODO Edit folder structure after we have more docs.
-            - docs/debugging.qmd
-            - docs/multipack.qmd
-            - docs/fsdp_qlora.qmd
-            - docs/input_output.qmd
-            - docs/rlhf.qmd
-            - docs/nccl.qmd
-            - docs/mac.qmd
-            - docs/multi-node.qmd
-        - section: "Dataset Formats"
-          contents: docs/dataset-formats/*
-        - section: "Reference"
-          contents:
-            - docs/config.qmd
-        - docs/faq.qmd
-
-
-format:
-  html:
-    theme: materia
-    css: styles.css
-    toc: true
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -22,11 +22,10 @@ RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
    fi

 # So we can test the Docker image
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -1,39 +0,0 @@
-{
-  "zero_optimization": {
-    "stage": 3,
-    "offload_optimizer": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "offload_param": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-    "sub_group_size": 0,
-    "reduce_bucket_size": "auto",
-    "stage3_prefetch_bucket_size": "auto",
-    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
-  },
-  "bf16": {
-    "enabled": true
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -1,35 +0,0 @@
-{
-  "zero_optimization": {
-    "stage": 3,
-    "offload_param": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-    "sub_group_size": 0,
-    "reduce_bucket_size": "auto",
-    "stage3_prefetch_bucket_size": "auto",
-    "stage3_param_persistence_threshold": "auto",
-    "stage3_max_live_parameters": 0,
-    "stage3_max_reuse_distance": 0,
-    "stage3_gather_16bit_weights_on_model_save": true
-  },
-  "bf16": {
-    "enabled": true
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/devtools/README.md
+++ b/devtools/README.md
@@ -1 +1 @@
-This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
+This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information.
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,11 +20,10 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
    fi

 # So we can test the Docker image
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,2 +0,0 @@
-/.quarto/
-_site/
--- a/docs/batch_vs_grad.qmd
+++ b/docs/batch_vs_grad.qmd
@@ -1,59 +0,0 @@
---
-title: Batch size vs Gradient accumulation
-description: Understanding of batch size and gradient accumulation steps
---
-
-Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
-
-This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
-
-1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
-
-2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
-
-**Example 1:**
-Micro batch size: 3
-Gradient accumulation steps: 2
-Number of GPUs: 3
-Total batch size = 3 * 2 * 3 = 18
-
-```
-| GPU 1          | GPU 2          | GPU 3          |
-|----------------|----------------|----------------|
-| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
-| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
-|----------------|----------------|----------------|
-| → (accumulate) | → (accumulate) | → (accumulate) |
-|----------------|----------------|----------------|
-| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
-| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
-|----------------|----------------|----------------|
-| → (apply)      | → (apply)      | → (apply)      |
-
-Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
-
-Weight update for w1:
-w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
-```
-
-**Example 2:**
-Micro batch size: 2
-Gradient accumulation steps: 1
-Number of GPUs: 3
-Total batch size = 2 * 1 * 3 = 6
-
-```
-| GPU 1     | GPU 2     | GPU 3     |
-|-----------|-----------|-----------|
-| S1, S2    | S3, S4    | S5, S6    |
-| e1, e2    | e3, e4    | e5, e6    |
-|-----------|-----------|-----------|
-| → (apply) | → (apply) | → (apply) |
-
-Accumulated gradient for the weight w1 (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
-
-Weight update for w1:
-w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
-```
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,445 +0,0 @@
---
-title: Config options
-description: A complete list of all configuration options.
---
-
-```yaml
-# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# This can also be a relative path to a model on disk
-base_model: ./llama-7b-hf
-# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# If the base_model repo on hf hub doesn't include configuration .json files,
-# You can set that here, or leave this empty to default to base_model
-base_model_config: ./llama-7b-hf
-# You can specify to choose a specific model revision from huggingface hub
-revision_of_model:
-# Optional tokenizer configuration path in case you want to use a different tokenizer
-# than the one defined in the base model
-tokenizer_config:
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# Trust remote code for untrusted source
-trust_remote_code:
-# use_fast option for tokenizer loading from_pretrained, default to True
-tokenizer_use_fast:
-# Whether to use the legacy tokenizer setting, defaults to True
-tokenizer_legacy:
-# Resize the model embeddings when new tokens are added to multiples of 32
-# This is reported to improve training speed on some models
-resize_token_embeddings_to_32x:
-
-# (Internal use only)
-# Used to identify which the model is based on
-is_falcon_derived_model:
-is_llama_derived_model:
-is_qwen_derived_model:
-# Please note that if you set this to true, `padding_side` will be set to "left" by default
-is_mistral_derived_model:
-
-# optional overrides to the base model configuration
-overrides_of_model_config:
-  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-  rope_scaling:
-    type: # linear | dynamic
-    factor: # float
-
-# optional overrides to the bnb 4bit quantization configuration
-# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
-bnb_config_kwargs:
-  # These are default values
-  llm_int8_has_fp16_weight: false
-  bnb_4bit_quant_type: nf4
-  bnb_4bit_use_double_quant: true
-
-
-# Whether you are training a 4-bit GPTQ quantized model
-gptq: true
-
-# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# Use bitsandbytes 4 bit
-load_in_4bit:
-
-# Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
-# Use CUDA fp16
-fp16: true
-# Use CUDA tf32
-tf32: true # require >=ampere
-
-# No AMP (automatic mixed precision)
-bfloat16: true # require >=ampere
-float16: true
-
-# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
-gpu_memory_limit: 20GiB
-# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
-lora_on_cpu: true
-
-# A list of one or more datasets to finetune the model with
-datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
-  - path: vicgalle/alpaca-gpt4
-  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
-    data_files: # Optional[str] path to source data files
-    shards: # Optional[int] number of shards to split data into
-    name: # Optional[str] name of dataset configuration to load
-    train_on_split: train # Optional[str] name of dataset split to load from
-
-    # Optional[str] fastchat conversation type, only used with type: sharegpt
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].
-
-  # Custom user instruction prompt
-  - path: repo
-    type:
-      # The below are defaults. only set what's needed if you use a different column name.
-      system_prompt: ""
-      system_format: "{system}"
-      field_system: system
-      field_instruction: instruction
-      field_input: input
-      field_output: output
-
-      # Customizable to be single line or multi-line
-      # Use {instruction}/{input} as key to be replaced
-      # 'format' can include {input}
-      format: |-
-        User: {instruction} {input}
-        Assistant:
-      # 'no_input_format' cannot include {input}
-      no_input_format: "{instruction} "
-
-      # For `completion` datsets only, uses the provided field instead of `text` column
-      field:
-
-# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
-# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
-shuffle_merged_datasets: true
-
-# A list of one or more datasets to eval the model with.
-# You can use either test_datasets, or val_set_size, but not both.
-test_datasets:
-  - path: /workspace/data/eval.jsonl
-    ds_type: json
-    # You need to specify a split. For "json" datasets the default split is called "train".
-    split: train
-    type: completion
-    data_files:
-      - /workspace/data/eval.jsonl
-
-# use RL training: 'dpo', 'ipo', 'kto_pair'
-rl:
-
-# Saves the desired chat template to the tokenizer_config.json for easier inferencing
-# Currently supports chatml and inst (mistral/mixtral)
-chat_template: chatml
-# Changes the default system message
-default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
-# Axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# Push prepared dataset to hub
-push_dataset_to_hub: # repo path
-# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
-# if not set.
-dataset_processes: # defaults to os.cpu_count() if not set
-# Keep dataset in memory while preprocessing
-# Only needed if cached dataset is taking too much storage
-dataset_keep_in_memory:
-# push checkpoints to hub
-hub_model_id: # private repo path to push finetuned model
-# how to push checkpoints to hub
-# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
-hub_strategy:
-# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
-# Required to be true when used in combination with `push_dataset_to_hub`
-hf_use_auth_token: # boolean
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
-val_set_size: 0.04
-# Num shards for whole dataset
-dataset_shard_num:
-# Index of shard to use for whole dataset
-dataset_shard_idx:
-
-# The maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# Pad inputs so each step uses constant sized buffers
-# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len:
-# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
-sample_packing:
-# Set to 'false' if getting errors during eval with sample_packing on.
-eval_sample_packing:
-# You can set these packing optimizations AFTER starting a training at least once.
-# The trainer will provide recommended values for these values.
-sample_packing_eff_est:
-total_num_tokens:
-
-# Passed through to transformers when loading the model when launched without accelerate
-# Use `sequential` when training w/ model parallelism to limit memory
-device_map:
-# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
-max_memory:
-
-# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
-adapter: lora
-# If you already have a lora model trained that you want to load, put that here.
-# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
-# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
-lora_model_dir:
-
-# LoRA hyperparameters
-# For more details about the following options, see:
-# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-#  - gate_proj
-#  - down_proj
-#  - up_proj
-lora_target_linear: # If true, will target all linear modules
-peft_layers_to_transform: # The layer indices to transform, otherwise, apply to all layers
-
-# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
-# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
-# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
-lora_modules_to_save:
-#  - embed_tokens
-#  - lm_head
-
-lora_fan_in_fan_out: false
-
-peft:
-  # Configuration options for loftq initialization for LoRA
-  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
-  loftq_config:
-    loftq_bits:  # typically 4 bits
-
-# ReLoRA configuration
-# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-relora_steps: # Number of steps per ReLoRA restart
-relora_warmup_steps: # Number of per-restart warmup steps
-relora_anneal_steps: # Number of anneal steps for each relora cycle
-relora_prune_ratio: # threshold for optimizer magnitude when pruning
-relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
-# wandb configuration if you're using it
-# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
-wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
-wandb_project: # Your wandb project name
-wandb_entity: # A wandb Team name if using a Team
-wandb_watch:
-wandb_name: # Set the name of your wandb run
-wandb_run_id: # Set the ID of your wandb run
-wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
-
-# mlflow configuration if you're using it
-mlflow_tracking_uri: # URI to mlflow
-mlflow_experiment_name: # Your experiment name
-hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry
-
-# Where to save the full-finetuned model to
-output_dir: ./completed-model
-
-# Whether to use torch.compile and which backend to use
-torch_compile:  # bool
-torch_compile_backend:  # Optional[str]
-
-# Training hyperparameters
-
-# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
-gradient_accumulation_steps: 1
-# The number of samples to include in each batch. This is the number of samples sent to each GPU.
-micro_batch_size: 2
-eval_batch_size:
-num_epochs: 4
-warmup_steps: 100  # cannot use with warmup_ratio
-warmup_ratio: 0.05  # cannot use with warmup_steps
-learning_rate: 0.00003
-lr_quadratic_warmup:
-logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
-evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-save_strategy: # Set to `no` to skip checkpoint saves
-save_steps: # Leave empty to save at each epoch
-saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
-save_total_limit: # Checkpoints saved at a time
-# Maximum number of iterations to train for. It precedes num_epochs which means that
-# if both are set, num_epochs will not be guaranteed.
-# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-max_steps:
-
-eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", chrf]
-
-loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
-loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
-
-# Save model as safetensors (require safetensors package)
-save_safetensors:
-
-# Whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# Group similarly sized data to minimize padding.
-# May be slower to start, as it must download and sort the entire dataset.
-# Note that training loss may have an oscillating pattern with this enabled.
-group_by_length: false
-
-# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-gradient_checkpointing: false
-# additional kwargs to pass to the trainer for gradient checkpointing
-# gradient_checkpointing_kwargs:
-#   use_reentrant: true
-
-# Stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-
-# Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
-lr_scheduler_kwargs:
-cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
-cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
-
-# For one_cycle optim
-lr_div_factor: # Learning rate div factor
-
-# Specify optimizer
-# Valid values are driven by the Transformers OptimizerNames class, see:
-# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
-#
-# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
-# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
-# in the examples/ for your model and fine-tuning use case.
-#
-# Valid values for 'optimizer' include:
-# - adamw_hf
-# - adamw_torch
-# - adamw_torch_fused
-# - adamw_torch_xla
-# - adamw_apex_fused
-# - adafactor
-# - adamw_anyprecision
-# - sgd
-# - adagrad
-# - adamw_bnb_8bit
-# - lion_8bit
-# - lion_32bit
-# - paged_adamw_32bit
-# - paged_adamw_8bit
-# - paged_lion_32bit
-# - paged_lion_8bit
-# - galore_adamw
-# - galore_adamw_8bit
-# - galore_adafactor
-# - galore_adamw_layerwise
-# - galore_adamw_8bit_layerwise
-# - galore_adafactor_layerwise
-optimizer:
-# Dictionary of arguments to pass to the optimizer
-optim_args:
-# For Galore Optimizers the following optim_args are available
-# rank:  # type: int
-# update_proj_gap  # type: int
-# scale  # type: float
-# proj_type:  # type: str, default = std
-
-# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
-optim_target_modules:
-# - self_attn  # for llama
-# - mlp
-
-# Specify weight decay
-weight_decay:
-# adamw hyperparams
-adam_beta1:
-adam_beta2:
-adam_epsilon:
-# Gradient clipping max norm
-max_grad_norm:
-
-# Augmentation techniques
-# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
-# currently only supported on Llama and Mistral
-neftune_noise_alpha:
-
-# Whether to bettertransformers
-flash_optimum:
-# Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
-flash_attention:
-flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
-flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
-flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
-flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
-# Whether to use scaled-dot-product attention
-# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-sdp_attention:
-# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
-s2_attention:
-# Resume from a specific checkpoint dir
-resume_from_checkpoint:
-# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
-# Be careful with this being turned on between different models.
-auto_resume_from_checkpoints: false
-
-# Don't mess with this, it's here for accelerate and torchrun
-local_rank:
-
-# Add or change special tokens.
-# If you add tokens here, you don't need to add them to the `tokens` list.
-special_tokens:
-  # bos_token: "<s>"
-  # eos_token: "</s>"
-  # unk_token: "<unk>"
-
-# Add extra tokens.
-tokens:
-
-# FSDP
-fsdp:
-fsdp_config:
-
-# Deepspeed config path. e.g., deepspeed_configs/zero3.json
-deepspeed:
-
-# Advanced DDP Arguments
-ddp_timeout:
-ddp_bucket_cap_mb:
-ddp_broadcast_buffers:
-
-# Path to torch distx for optim 'adamw_anyprecision'
-torchdistx_path:
-
-# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-pretraining_dataset:
-
-# Debug mode
-debug:
-
-# Seed
-seed:
-
-# Allow overwrite yml config using from cli
-strict:
-```
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -1,63 +0,0 @@
---
-title: Conversation
-description: Conversation format for supervised fine-tuning.
-order: 3
---
-
-## sharegpt
-
-conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"from": "...", "value": "..."}]}
-```
-
-Note: `type: sharegpt` opens special configs:
- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
- `field_human`: specify the key to use instead of `human` in the conversation.
- `field_model`: specify the key to use instead of `gpt` in the conversation.
-
-```yaml
-datasets:
-    path: ...
-    type: sharegpt
-
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-    field_human: # Optional[str]. Human key to use for conversation.
-    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].
-```
-
-## pygmalion
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
-```
-
-## sharegpt.load_role
-
-conversations where `role` is used instead of `from`
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
-```
-
-## sharegpt.load_guanaco
-
-conversations where `from` is `prompter` `assistant` instead of default sharegpt
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"from": "...", "value": "..."}]}
-```
-
-## sharegpt_jokes
-
-creates a chat where bot is asked to tell a joke, then explain why the joke is funny
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
-```
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -1,14 +0,0 @@
---
-title: Dataset Formats
-description: Supported dataset formats.
-listing:
-  fields: [title, description]
-  type: table
-  sort-ui: false
-  filter-ui: false
-  max-description-length: 250
---
-
-Axolotl supports a variety of dataset formats.  It is recommended to use a JSONL format.  The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.
-
-Below are these various formats organized by task:
--- a/docs/dataset-formats/inst_tune.qmd
+++ b/docs/dataset-formats/inst_tune.qmd
@@ -1,189 +0,0 @@
---
-title: Instruction Tuning
-description: Instruction tuning formats for supervised fine-tuning.
-order: 2
---
-
-## alpaca
-
-instruction; input(optional)
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "input": "...", "output": "..."}
-```
-
-## jeopardy
-
-question and answer
-
-```{.json filename="data.jsonl"}
-{"question": "...", "category": "...", "answer": "..."}
-```
-
-## oasst
-
-instruction
-
-```{.json filename="data.jsonl"}
-{"INSTRUCTION": "...", "RESPONSE": "..."}
-```
-
-## gpteacher
-
-instruction; input(optional)
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "input": "...", "response": "..."}
-```
-
-## reflection
-
-instruction with reflect; input(optional)
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
-```
-
-## explainchoice
-
-question, choices, (solution OR explanation)
-
-```{.json filename="data.jsonl"}
-{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
-```
-
-## concisechoice
-
-question, choices, (solution OR explanation)
-
-```{.json filename="data.jsonl"}
-{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
-```
-
-## summarizetldr
-
-article and summary
-
-```{.json filename="data.jsonl"}
-{"article": "...", "summary": "..."}
-```
-
-## alpaca_chat
-
-basic instruct for alpaca chat
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "input": "...", "response": "..."}
-```
-
-## alpaca_chat.load_qa
-
-question and answer for alpaca chat
-
-```{.json filename="data.jsonl"}
-{"question": "...", "answer": "..."}
-```
-
-## alpaca_chat.load_concise
-
-question and answer for alpaca chat, for concise answers
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "input": "...", "response": "..."}
-```
-
-## alpaca_chat.load_camel_ai
-
-question and answer for alpaca chat, for load_camel_ai
-
-```{.json filename="data.jsonl"}
-{"message_1": "...", "message_2": "..."}
-```
-
-## alpaca_w_system.load_open_orca
-
-support for open orca datasets with included system prompts, instruct
-
-```{.json filename="data.jsonl"}
-{"system_prompt": "...", "question": "...", "response": "..."}
-```
-
-## context_qa
-
-in context question answering from an article
-
-```{.json filename="data.jsonl"}
-{"article": "...", "question": "...", "answer": "..."}
-```
-
-## context_qa.load_v2
-
-in context question answering (alternate)
-
-```{.json filename="data.jsonl"}
-{"context": "...", "question": "...", "answer": "..."}
-```
-
-## context_qa.load_404
-
-in context question answering from an article, with default response for no answer from context
-
-```{.json filename="data.jsonl"}
-{"article": "...", "unanswerable_question": "..."}
-```
-
-## creative_acr.load_answer
-
-instruction and revision
-
-```{.json filename="data.jsonl"}
-{"instruction": "...", "revision": "..."}
-```
-
-## creative_acr.load_critique
-
-critique
-
-```{.json filename="data.jsonl"}
-{"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
-```
-
-## creative_acr.load_revise
-
-critique and revise
-
-```{.json filename="data.jsonl"}
-{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
-```
-
-## metharme
-
-instruction, adds additional eos tokens
-
-```{.json filename="data.jsonl"}
-{"prompt": "...", "generation": "..."}
-```
-
-## How to add custom prompt format
-
-For a dataset that is preprocessed for instruction purposes:
-
-```{.json filename="data.jsonl"}
-{"input": "...", "output": "..."}
-```
-
-You can use this example in your YAML config:
-
-```{.yaml filename="config.yaml"}
-datasets:
-  - path: repo
-    type:
-      system_prompt: ""
-      field_system: system
-      field_instruction: input
-      field_output: output
-      format: "[INST] {instruction} [/INST]"
-      no_input_format: "[INST] {instruction} [/INST]"
-```
-
-See full config options under [here](../config.qmd).
--- a/docs/dataset-formats/pretraining.qmd
+++ b/docs/dataset-formats/pretraining.qmd
@@ -1,26 +0,0 @@
---
-title: Pre-training
-description: Data format for a pre-training completion task.
-order: 1
---
-
-For pretraining, there is no prompt template or roles.  The only required field is `text`:
-
-```{.json filename="data.jsonl"}
-{"text": "first row"}
-{"text": "second row"}
-...
-```
-
-:::{.callout-note}
-
-### Streaming is recommended for large datasets
-
-Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
-
-```{.yaml filename="config.yaml"}
-pretraining_dataset: # hf path only
-...
-```
-
-:::
--- a/docs/dataset-formats/template_free.qmd
+++ b/docs/dataset-formats/template_free.qmd
@@ -1,7 +0,0 @@
---
-title: Template-Free
-description: Construct prompts without a template.
-order: 4
---
-
-See [these docs](../input_output.qmd).
--- a/docs/dataset-formats/tokenized.qmd
+++ b/docs/dataset-formats/tokenized.qmd
@@ -1,12 +0,0 @@
---
-title: Custom Pre-Tokenized Dataset
-description: How to use a custom pre-tokenized dataset.
-order: 5
---
-
- Do not pass a `type:` in your axolotl config.
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
-
-```{.yaml filename="config.yml"}
- path: ...
-```
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -1,8 +1,4 @@
---
-title: Debugging
-description: How to debug Axolotl
---
-
+# Debugging Axolotl

 This document provides some tips and tricks for debugging Axolotl.  It also provides an example configuration for debugging with VSCode.  A good debugging setup is essential to understanding how Axolotl code works behind the scenes.

--- a/docs/faq.md
+++ b/docs/faq.md
@@ -0,0 +1,18 @@
+# Axolotl FAQs
+
+
+> The trainer stopped and hasn't progressed in several minutes.
+
+This is usually an issue with the GPUs communicating with each other. See the [NCCL doc](../docs/nccl.md).
+
+> Exitcode -9
+
+This usually happens when you run out of system RAM.
+
+> Exitcode -7 while using deepspeed
+
+Try upgrading deepspeed with: `pip install -U deepspeed`
+
+> AttributeError: 'DummyOptim' object has no attribute 'step'
+
+You may be using deepspeed with a single GPU. Please don't set `deepspeed:` in yaml or cli.
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -1,21 +0,0 @@
---
-title: FAQ
-description: Frequently asked questions
---
-
-
-**Q: The trainer stopped and hasn't progressed in several minutes.**
-
-> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)
-
-**Q: Exitcode -9**
-
-> A: This usually happens when you run out of system RAM.
-
-**Q: Exitcode -7 while using deepspeed**
-
-> A: Try upgrading deepspeed w: `pip install -U deepspeed`
-
-**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
-
-> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
--- a/docs/fsdp_qlora.qmd
+++ b/docs/fsdp_qlora.qmd
@@ -1,10 +1,4 @@
---
-title: "FDSP + QLoRA"
-description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
-format:
-  html:
-    toc: true
---
+# FSDP + QLoRA

 ## Background

--- a/docs/input_output.qmd
+++ b/docs/input_output.qmd
@@ -1,7 +1,4 @@
---
-title: Template-free prompt construction
-description: "Template-free prompt construction with the `input_output` format"
---
+# Template-free prompt construction with the `input_output` format

 <!-- TOC -->

@@ -43,7 +40,7 @@ labels so that your model can focus on predicting the outputs only.
 ### You may not want prompt templates

 However, there are many situations where you don't want to use one of
-these formats or templates. This is because they can:
+these formats or templates (I usually don't!). This is because they can:

 -   Add unnecessary boilerplate to your prompts.
 -   Create artifacts like special delimiters `<|im_start|>` that can
@@ -91,9 +88,8 @@ format into a jsonl file (below is the first row from the file

 ```bash
 $ head -n1 output.jsonl | python -m json.tool
-```

-:::{.cell-output .cell-output-stdout}
+{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
@@ -114,7 +110,7 @@ $ head -n1 output.jsonl | python -m json.tool
            }
        ]
    }
-:::
+```

 Set `label:false` when you want to mask a segment of text so that the
 model isn't trained on it. Some things to keep in mind:
@@ -239,9 +235,8 @@ version is repeated below for reference):

 ```bash
 $ head -n1 output.jsonl | python -m json.tool
-```

-:::{.cell-output .cell-output-stdout}
+{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
@@ -262,4 +257,4 @@ $ head -n1 output.jsonl | python -m json.tool
            }
        ]
    }
-:::
+```
--- a/docs/mac.qmd
+++ b/docs/mac.qmd
@@ -1,12 +1,8 @@
---
-title: Mac M-series
-description: Mac M-series support
---
+# Mac M series support

 Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.

 Current support:
-
 - [x] Support for all models
 - [x] Full training of models
 - [x] LoRA training
--- a/docs/multi-node.qmd
+++ b/docs/multi-node.qmd
@@ -1,7 +1,4 @@
---
-title: Multi Node
-description: How to use Axolotl on multiple machines
---
+# Multi Node

 You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:

--- a/docs/multipack.qmd
+++ b/docs/multipack.qmd
@@ -1,7 +1,4 @@
---
-title: Multipack (Sample Packing)
-description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
---
+# Multipack (Sample Packing)

 ## Visualization of Multipack with Flash Attention

--- a/docs/nccl.qmd
+++ b/docs/nccl.qmd
@@ -1,7 +1,4 @@
---
-title: NCCL
-description: Troubleshooting NCCL issues
---
+# NCCL

 NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:

--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -1,7 +1,4 @@
---
-title: "RLHF (Beta)"
-description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
---
+# RLHF (Beta)

 ### Overview

@@ -37,21 +34,6 @@ datasets:
 rl: ipo
 ```

-#### ORPO
-
-Paper: https://arxiv.org/abs/2403.07691
-
-```yaml
-rl: orpo
-orpo_alpha: 0.1
-remove_unused_columns: false
-
-chat_template: chatml
-datasets:
-  - path: argilla/ultrafeedback-binarized-preferences-cleaned
-    type: orpo.chat_template
-```
-
 #### Using local dataset files
 ```yaml
 datasets:
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -21,8 +21,7 @@ lora_dropout: 0.05
 lora_target_linear: true

 sequence_len: 4096
-sample_packing: true
-eval_sample_packing: false
+sample_packing: false
 pad_to_sequence_len: true

 wandb_project:
--- a/examples/jamba/README.md
+++ b/examples/jamba/README.md
@@ -1,10 +0,0 @@
-# Jamba
-
- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
-  - 35GiB VRAM per GPU w minimal context length
-  - 56GiB VRAM per GPU (w multipack enabled)
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
- ✅ qlora single-gpu, ~51GiB VRAM
- ✅ multipack
- ❓ FSDP
- ❓ 8-bit LoRA
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -1,62 +0,0 @@
-base_model: ai21labs/Jamba-v0.1
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./out
-
-sequence_len: 4096
-sample_packing: false
-pad_to_sequence_len: false
-eval_sample_packing: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-adapter: qlora
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-low_cpu_mem_usage: true
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 2
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.00001
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-special_tokens:
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -1,62 +0,0 @@
-base_model: ai21labs/Jamba-v0.1
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.0
-output_dir: ./out
-
-sequence_len: 4096
-sample_packing: false
-pad_to_sequence_len: false
-eval_sample_packing: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-adapter: qlora
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-low_cpu_mem_usage: true
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 2
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.00001
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch:
-saves_per_epoch: 1
-debug:
-deepspeed: deepspeed_configs/zero2.json
-weight_decay: 0.0
-special_tokens:
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -1,75 +0,0 @@
-base_model: NousResearch/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: teknium/GPT4-LLM-Cleaned
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./lisa-out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-adapter:
-lora_model_dir:
-lora_r:
-lora_alpha:
-lora_dropout:
-lora_target_linear:
-lora_fan_in_fan_out:
-
-lisa_n_layers: 4
-lisa_step_interval: 20
-lisa_layers_attribute: model.layers
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 5e-5 # recommendation from lisa paper for 7b
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-flash_attn_cross_entropy: false
-flash_attn_rms_norm: true
-flash_attn_fuse_qkv: false
-flash_attn_fuse_mlp: true
-
-warmup_steps: 100
-evals_per_epoch: 4
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.1
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -36,7 +36,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 4
 num_epochs: 4
-optimizer: adamw_torch
+optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001

@@ -66,11 +66,5 @@ weight_decay: 0.0
 fsdp:
  - full_shard
 fsdp_config:
-  fsdp_limit_all_gathers: true
-  fsdp_sync_module_states: true
-  fsdp_offload_params: true
-  fsdp_use_orig_params: false
-  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
 special_tokens:
--- a/examples/mistral/Mistral-7b-example/README.md
+++ b/examples/mistral/Mistral-7b-example/README.md
@@ -0,0 +1,12 @@
+# Description
+This repository presents an in-depth guide for fine-tuning Mistral-7b or any other compatible model using Axolotl, tailored specifically for chatbot development. It streamlines the process of fine-tuning and uploading the enhanced model to HuggingFace 🤗, thereby serving as an invaluable tool for developers in the AI and chatbot domain.
+
+**What’s Inside:**
+
+Beginner-Friendly Instructions: Comprehensive steps to guide you through fine-tuning your chosen model, including details on the data structure (jsonl), configuration, and the code itself.
+
+Hardware Utilized: For reference, the fine-tuning in this guide was performed using 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel).
+
+**Uploading to HuggingFace 🤗:**
+To upload your fine-tuned model to Hugging Face, include the following files:
+![Screenshot 2024-01-19 213932](https://github.com/OpenAccess-AI-Collective/axolotl/assets/138583191/d660eb84-2d76-46a1-9846-cf0aeb3006d9)
--- a/examples/mistral/Mistral-7b-example/code.ipynb
+++ b/examples/mistral/Mistral-7b-example/code.ipynb
@@ -0,0 +1,970 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3fe31229-8f6b-48bc-a86d-af8e5466d11c",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GPU available? True\n",
+      "BF16 is supported? True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check if a GPU is available. I used 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel)\n",
+    "import torch\n",
+    "print('GPU available?', torch.cuda.is_available())\n",
+    "print('BF16 is supported?', torch.cuda.is_bf16_supported())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1dee845b-f3cb-4b1e-bdd9-1a918eac140b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting huggingface_hub\n",
+      "  Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
+      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.7.1)\n",
+      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n",
+      "Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: huggingface_hub\n",
+      "Successfully installed huggingface_hub-0.20.1\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "88731672-9050-4034-8266-11aaace2a44e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6b5aa7d7-3b18-4c14-afd4-043c2c545259",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "60df98d7b0294289aad8b6c8cd023c3b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Log in to Hugging Face so you can push the model to the Hub later\n",
+    "import sys\n",
+    "stdout = sys.stdout\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b74d0635-5033-4494-b7bd-ff6822103d93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I noticed that after calling notebook_login() nothing gets printed anymore, so we restore the sys.stdout saved before the login\n",
+    "sys.stdout = stdout"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e3c3b088-45e7-484b-ae39-66beabc48da8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning into 'axolotl'...\n",
+      "remote: Enumerating objects: 235, done.\u001b[K\n",
+      "remote: Counting objects: 100% (235/235), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (207/207), done.\u001b[K\n",
+      "remote: Total 235 (delta 48), reused 123 (delta 13), pack-reused 0\u001b[K\n",
+      "Receiving objects: 100% (235/235), 1.46 MiB | 11.65 MiB/s, done.\n",
+      "Resolving deltas: 100% (48/48), done.\n"
+     ]
+    }
+   ],
+   "source": [
+    "#axolotl\n",
+    "!git clone -b main --depth 1 https://github.com/OpenAccess-AI-Collective/axolotl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "66927751-4fd6-4477-97fc-6ab08c9d9a74",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/axolotl\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd axolotl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "fcccf8da-353b-4d70-8f55-5cfe08c7e6b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0mObtaining file:///axolotl\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting auto-gptq==0.5.1\n",
+      "  Downloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
+      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
+      "Collecting peft==0.6.0\n",
+      "  Downloading peft-0.6.0-py3-none-any.whl.metadata (23 kB)\n",
+      "Collecting transformers==4.36.2\n",
+      "  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.8/126.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting tokenizers==0.15.0\n",
+      "  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
+      "Collecting bitsandbytes>=0.41.1\n",
+      "  Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl.metadata (9.8 kB)\n",
+      "Collecting accelerate==0.24.1\n",
+      "  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting addict\n",
+      "  Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
+      "Collecting fire\n",
+      "  Downloading fire-0.5.0.tar.gz (88 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.3/88.3 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.10/site-packages (6.0.1)\n",
+      "Collecting datasets>=2.15.0\n",
+      "  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting sentencepiece\n",
+      "  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting wandb\n",
+      "  Downloading wandb-0.16.1-py3-none-any.whl.metadata (9.8 kB)\n",
+      "Collecting einops\n",
+      "  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)\n",
+      "Collecting optimum==1.13.2\n",
+      "  Downloading optimum-1.13.2.tar.gz (300 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting hf_transfer\n",
+      "  Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting colorama\n",
+      "  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
+      "Collecting numba\n",
+      "  Downloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
+      "Requirement already satisfied: numpy>=1.24.4 in /opt/conda/lib/python3.10/site-packages (1.26.0)\n",
+      "Collecting bert-score==0.3.13\n",
+      "  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting evaluate==0.4.0\n",
+      "  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rouge-score==0.1.2\n",
+      "  Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting scipy\n",
+      "  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting scikit-learn==1.2.2\n",
+      "  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
+      "\u001b[?25hCollecting pynvml\n",
+      "  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting art\n",
+      "  Downloading art-6.1-py3-none-any.whl.metadata (69 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting fschat==0.2.34\n",
+      "  Downloading fschat-0.2.34-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting gradio==3.50.2\n",
+      "  Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)\n",
+      "Collecting tensorboard\n",
+      "  Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n",
+      "Collecting s3fs\n",
+      "  Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting gcsfs\n",
+      "  Downloading gcsfs-2023.12.2.post1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting xformers==0.0.23\n",
+      "  Downloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)\n",
+      "Collecting deepspeed\n",
+      "  Downloading deepspeed-0.12.6.tar.gz (1.2 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m109.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting flash-attn==2.3.3\n",
+      "  Downloading flash_attn-2.3.3.tar.gz (2.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m111.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (5.9.0)\n",
+      "Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (2.1.1)\n",
+      "Requirement already satisfied: huggingface-hub in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (0.20.1)\n",
+      "Collecting rouge (from auto-gptq==0.5.1)\n",
+      "  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)\n",
+      "Collecting gekko (from auto-gptq==0.5.1)\n",
+      "  Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hCollecting safetensors (from auto-gptq==0.5.1)\n",
+      "  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from auto-gptq==0.5.1) (4.65.0)\n",
+      "Collecting pandas>=1.0.1 (from bert-score==0.3.13)\n",
+      "  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from bert-score==0.3.13) (2.31.0)\n",
+      "Collecting matplotlib (from bert-score==0.3.13)\n",
+      "  Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
+      "Collecting dill (from evaluate==0.4.0)\n",
+      "  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n",
+      "Collecting xxhash (from evaluate==0.4.0)\n",
+      "  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+      "Collecting multiprocess (from evaluate==0.4.0)\n",
+      "  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n",
+      "Requirement already satisfied: fsspec>=2021.05.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]>=2021.05.0->evaluate==0.4.0) (2023.10.0)\n",
+      "Collecting responses<0.19 (from evaluate==0.4.0)\n",
+      "  Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
+      "Collecting ninja (from flash-attn==2.3.3)\n",
+      "  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)\n",
+      "Collecting aiohttp (from fschat==0.2.34)\n",
+      "  Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
+      "Collecting fastapi (from fschat==0.2.34)\n",
+      "  Downloading fastapi-0.108.0-py3-none-any.whl.metadata (24 kB)\n",
+      "Collecting httpx (from fschat==0.2.34)\n",
+      "  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n",
+      "Collecting markdown2[all] (from fschat==0.2.34)\n",
+      "  Downloading markdown2-2.4.12-py2.py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting nh3 (from fschat==0.2.34)\n",
+      "  Downloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
+      "Requirement already satisfied: prompt-toolkit>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from fschat==0.2.34) (3.0.36)\n",
+      "Collecting pydantic<2,>=1 (from fschat==0.2.34)\n",
+      "  Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rich>=10.0.0 (from fschat==0.2.34)\n",
+      "  Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting shortuuid (from fschat==0.2.34)\n",
+      "  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)\n",
+      "Collecting tiktoken (from fschat==0.2.34)\n",
+      "  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting uvicorn (from fschat==0.2.34)\n",
+      "  Downloading uvicorn-0.25.0-py3-none-any.whl.metadata (6.4 kB)\n",
+      "Collecting aiofiles<24.0,>=22.0 (from gradio==3.50.2)\n",
+      "  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
+      "Collecting altair<6.0,>=4.2.0 (from gradio==3.50.2)\n",
+      "  Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)\n",
+      "Collecting ffmpy (from gradio==3.50.2)\n",
+      "  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting gradio-client==0.6.1 (from gradio==3.50.2)\n",
+      "  Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)\n",
+      "Collecting importlib-resources<7.0,>=1.3 (from gradio==3.50.2)\n",
+      "  Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)\n",
+      "Requirement already satisfied: jinja2<4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (3.1.2)\n",
+      "Requirement already satisfied: markupsafe~=2.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (2.1.1)\n",
+      "Collecting orjson~=3.0 (from gradio==3.50.2)\n",
+      "  Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: pillow<11.0,>=8.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (10.0.1)\n",
+      "Collecting pydub (from gradio==3.50.2)\n",
+      "  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
+      "Collecting python-multipart (from gradio==3.50.2)\n",
+      "  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting semantic-version~=2.0 (from gradio==3.50.2)\n",
+      "  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
+      "Requirement already satisfied: typing-extensions~=4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (4.7.1)\n",
+      "Collecting websockets<12.0,>=10.0 (from gradio==3.50.2)\n",
+      "  Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting coloredlogs (from optimum==1.13.2)\n",
+      "  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from optimum==1.13.2) (1.11.1)\n",
+      "Collecting absl-py (from rouge-score==0.1.2)\n",
+      "  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)\n",
+      "Collecting nltk (from rouge-score==0.1.2)\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score==0.1.2) (1.16.0)\n",
+      "Collecting joblib>=1.1.1 (from scikit-learn==1.2.2)\n",
+      "  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n",
+      "Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.2.2)\n",
+      "  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers==4.36.2) (3.9.0)\n",
+      "Collecting regex!=2019.12.17 (from transformers==4.36.2)\n",
+      "  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate==0.24.1) (3.1)\n",
+      "Collecting pyarrow>=8.0.0 (from datasets>=2.15.0)\n",
+      "  Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n",
+      "Collecting pyarrow-hotfix (from datasets>=2.15.0)\n",
+      "  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n",
+      "Collecting hjson (from deepspeed)\n",
+      "  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting py-cpuinfo (from deepspeed)\n",
+      "  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
+      "Collecting termcolor (from fire)\n",
+      "  Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.10/site-packages (from gcsfs) (5.1.1)\n",
+      "INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.\n",
+      "Collecting gcsfs\n",
+      "  Downloading gcsfs-2023.12.1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading gcsfs-2023.12.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting google-auth>=1.2 (from gcsfs)\n",
+      "  Downloading google_auth-2.25.2-py2.py3-none-any.whl.metadata (4.7 kB)\n",
+      "Collecting google-auth-oauthlib (from gcsfs)\n",
+      "  Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-cloud-storage (from gcsfs)\n",
+      "  Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting llvmlite<0.42,>=0.41.0dev0 (from numba)\n",
+      "  Downloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
+      "Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)\n",
+      "  Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)\n",
+      "INFO: pip is looking at multiple versions of s3fs to determine which version is compatible with other requirements. This could take a while.\n",
+      "Collecting s3fs\n",
+      "  Downloading s3fs-2023.12.1-py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading s3fs-2023.10.0-py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting aiobotocore~=2.7.0 (from s3fs)\n",
+      "  Downloading aiobotocore-2.7.0-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting grpcio>=1.48.2 (from tensorboard)\n",
+      "  Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n",
+      "Collecting markdown>=2.6.8 (from tensorboard)\n",
+      "  Downloading Markdown-3.5.1-py3-none-any.whl.metadata (7.1 kB)\n",
+      "Collecting protobuf<4.24,>=3.19.6 (from tensorboard)\n",
+      "  Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (540 bytes)\n",
+      "Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard) (68.0.0)\n",
+      "Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)\n",
+      "  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)\n",
+      "Collecting werkzeug>=1.0.1 (from tensorboard)\n",
+      "  Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)\n",
+      "Requirement already satisfied: Click!=8.0.0,>=7.1 in /opt/conda/lib/python3.10/site-packages (from wandb) (8.1.7)\n",
+      "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
+      "  Downloading GitPython-3.1.40-py3-none-any.whl.metadata (12 kB)\n",
+      "Collecting sentry-sdk>=1.0.0 (from wandb)\n",
+      "  Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl.metadata (9.7 kB)\n",
+      "Collecting docker-pycreds>=0.4.0 (from wandb)\n",
+      "  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
+      "Collecting setproctitle (from wandb)\n",
+      "  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)\n",
+      "Collecting appdirs>=1.4.3 (from wandb)\n",
+      "  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
+      "Collecting botocore<1.31.65,>=1.31.16 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading botocore-1.31.64-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->fschat==0.2.34) (23.1.0)\n",
+      "Collecting multidict<7.0,>=4.5 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n",
+      "Collecting frozenlist>=1.1.1 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+      "Collecting aiosignal>=1.1.2 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
+      "Collecting async-timeout<5.0,>=4.0 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n",
+      "Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (4.20.0)\n",
+      "Requirement already satisfied: toolz in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (0.12.0)\n",
+      "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
+      "  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
+      "Collecting cachetools<6.0,>=2.0.0 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)\n",
+      "Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.3/181.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rsa<5,>=3.1.4 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading rsa-4.9-py3-none-any.whl (34 kB)\n",
+      "Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->gcsfs)\n",
+      "  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n",
+      "Collecting contourpy>=1.0.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
+      "Collecting cycler>=0.10 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
+      "Collecting fonttools>=4.22.0 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (157 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.4 kB)\n",
+      "Collecting pyparsing>=2.3.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->bert-score==0.3.13) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.1->bert-score==0.3.13) (2023.3.post1)\n",
+      "Collecting tzdata>=2022.1 (from pandas>=1.0.1->bert-score==0.3.13)\n",
+      "  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit>=3.0.0->fschat==0.2.34) (0.2.5)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2023.7.22)\n",
+      "Collecting markdown-it-py>=2.2.0 (from rich>=10.0.0->fschat==0.2.34)\n",
+      "  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=10.0.0->fschat==0.2.34) (2.15.1)\n",
+      "Collecting h11>=0.8 (from uvicorn->fschat==0.2.34)\n",
+      "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum==1.13.2)\n",
+      "  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting starlette<0.33.0,>=0.29.0 (from fastapi->fschat==0.2.34)\n",
+      "  Downloading starlette-0.32.0.post1-py3-none-any.whl.metadata (5.8 kB)\n",
+      "Collecting typing-extensions~=4.0 (from gradio==3.50.2)\n",
+      "  Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)\n",
+      "Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-resumable-media>=2.6.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)\n",
+      "Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\n",
+      "Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (4.2.0)\n",
+      "Collecting httpcore==1.* (from httpx->fschat==0.2.34)\n",
+      "  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)\n",
+      "Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (1.3.0)\n",
+      "Collecting wavedrom (from markdown2[all]->fschat==0.2.34)\n",
+      "  Downloading wavedrom-2.0.3.post3.tar.gz (137 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.7/137.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->optimum==1.13.2) (1.3.0)\n",
+      "Collecting jmespath<2.0.0,>=0.7.1 (from botocore<1.31.65,>=1.31.16->aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
+      "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
+      "  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
+      "Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage->gcsfs)\n",
+      "  Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl.metadata (1.5 kB)\n",
+      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (2023.12.1)\n",
+      "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.32.0)\n",
+      "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.15.2)\n",
+      "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.0.0->fschat==0.2.34)\n",
+      "  Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
+      "Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs)\n",
+      "  Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)\n",
+      "Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs)\n",
+      "  Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx->fschat==0.2.34) (1.0.4)\n",
+      "Collecting svgwrite (from wavedrom->markdown2[all]->fschat==0.2.34)\n",
+      "  Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading fschat-0.2.34-py3-none-any.whl (220 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.1/220.1 kB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gradio-3.50.2-py3-none-any.whl (20.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.3/20.3 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading peft-0.6.0-py3-none-any.whl (134 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.9/134.9 kB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m90.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl (213.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 MB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m299.2/299.2 kB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading datasets-2.16.0-py3-none-any.whl (507 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m87.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.4/36.4 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading art-6.1-py3-none-any.whl (599 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m599.8/599.8 kB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gcsfs-2023.10.0-py2.py3-none-any.whl (33 kB)\n",
+      "Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m99.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading s3fs-2023.10.0-py3-none-any.whl (28 kB)\n",
+      "Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m96.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading wandb-0.16.1-py3-none-any.whl (2.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading absl_py-2.0.0-py3-none-any.whl (130 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading aiobotocore-2.7.0-py3-none-any.whl (73 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
+      "Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading altair-5.2.0-py3-none-any.whl (996 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m996.9/996.9 kB\u001b[0m \u001b[31m110.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m47.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_auth-2.25.2-py2.py3-none-any.whl (184 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m184.2/184.2 kB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)\n",
+      "Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading importlib_resources-6.1.1-py3-none-any.whl (33 kB)\n",
+      "Downloading joblib-1.3.2-py3-none-any.whl (302 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.2/302.2 kB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 MB\u001b[0m \u001b[31m74.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading Markdown-3.5.1-py3-none-any.whl (102 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.2/102.2 kB\u001b[0m \u001b[31m34.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m99.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m96.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.0/38.0 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m116.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading rich-13.7.0-py3-none-any.whl (240 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m240.6/240.6 kB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m102.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m71.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl (6.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)\n",
+      "Downloading uvicorn-0.25.0-py3-none-any.whl (60 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.7/226.7 kB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading fastapi-0.108.0-py3-none-any.whl (92 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
+      "Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl (121 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 kB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m108.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.2/307.2 kB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
+      "Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
+      "Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
+      "Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m101.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
+      "Downloading botocore-1.31.64-py3-none-any.whl (11.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m98.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\n",
+      "Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (310 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.7/310.7 kB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
+      "Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m102.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.5/239.5 kB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.0/122.0 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\n",
+      "Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyparsing-3.1.1-py3-none-any.whl (103 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.1/103.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading starlette-0.32.0.post1-py3-none-any.whl (70 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.0/70.0 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.3/80.3 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.6/301.6 kB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading markdown2-2.4.12-py2.py3-none-any.whl (41 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.2/41.2 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.7/228.7 kB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.9/84.9 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
+      "Building wheels for collected packages: flash-attn, optimum, rouge-score, deepspeed, fire, ffmpy, wavedrom\n",
+      "  Building wheel for flash-attn (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for flash-attn: filename=flash_attn-2.3.3-cp310-cp310-linux_x86_64.whl size=57042553 sha256=b1df92cb5bd7657d38b789dd48e907aa3e0bd2715c817eb85f3c4320bb11fb3f\n",
+      "  Stored in directory: /root/.cache/pip/wheels/e5/e6/fa/941802ec61d1afd320d27160ab1db98e6dba65381f84b76d4a\n",
+      "  Building wheel for optimum (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for optimum: filename=optimum-1.13.2-py3-none-any.whl size=395599 sha256=ff3a73120e1b6eeeda28f76e3fc8cd4cd826e5d66c869b7848ba150e7af79c62\n",
+      "  Stored in directory: /root/.cache/pip/wheels/6e/b7/2c/79405d98f0943373d8546daeae25a3d377f7659ca0cbe48699\n",
+      "  Building wheel for rouge-score (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=8118ecbbcd3529085e794c803f0ddb182fc6c6d3e8a494103b49a94abf1bec37\n",
+      "  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
+      "  Building wheel for deepspeed (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for deepspeed: filename=deepspeed-0.12.6-py3-none-any.whl size=1306729 sha256=35c46b6f0275b0d3063522e0af4f3cbd9ec1c310114d8917d87cbe2bf43346e2\n",
+      "  Stored in directory: /root/.cache/pip/wheels/a3/dc/a2/f585faaed4dec84108916dcc8e8a7c129a216df8202ca32984\n",
+      "  Building wheel for fire (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116934 sha256=e76d5185f237f34ec69bb8aa657497bef07408978e4f7efdaef48663bb8cd4ef\n",
+      "  Stored in directory: /root/.cache/pip/wheels/90/d4/f7/9404e5db0116bd4d43e5666eaa3e70ab53723e1e3ea40c9a95\n",
+      "  Building wheel for ffmpy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5579 sha256=da3b54dc0ac1a825a1a233315970ac80b8b4c53ebd9cb2a2cfdeab118f453a64\n",
+      "  Stored in directory: /root/.cache/pip/wheels/01/a6/d1/1c0828c304a4283b2c1639a09ad86f83d7c487ef34c6b4a1bf\n",
+      "  Building wheel for wavedrom (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for wavedrom: filename=wavedrom-2.0.3.post3-py2.py3-none-any.whl size=30052 sha256=7f0cbd15d63ee9c120190bac122ab51bbbfc91ee374bc3c046fadb320816c17e\n",
+      "  Stored in directory: /root/.cache/pip/wheels/9c/52/8c/38b454b42f712f325e26f633287484c7dc1ad469e1580c5954\n",
+      "Successfully built flash-attn optimum rouge-score deepspeed fire ffmpy wavedrom\n",
+      "Installing collected packages: sentencepiece, pydub, py-cpuinfo, ninja, nh3, hjson, ffmpy, bitsandbytes, appdirs, addict, xxhash, wrapt, werkzeug, websockets, tzdata, typing-extensions, threadpoolctl, termcolor, tensorboard-data-server, svgwrite, smmap, shortuuid, setproctitle, sentry-sdk, semantic-version, scipy, safetensors, rouge, regex, python-multipart, pyparsing, pynvml, pyasn1, pyarrow-hotfix, pyarrow, protobuf, orjson, oauthlib, multidict, mdurl, markdown2, markdown, llvmlite, kiwisolver, joblib, jmespath, importlib-resources, humanfriendly, hf_transfer, h11, grpcio, google-crc32c, gekko, frozenlist, fonttools, einops, docker-pycreds, dill, cycler, contourpy, colorama, cachetools, async-timeout, art, aioitertools, aiofiles, absl-py, yarl, wavedrom, uvicorn, tiktoken, scikit-learn, rsa, responses, requests-oauthlib, pydantic, pyasn1-modules, pandas, numba, nltk, multiprocess, matplotlib, markdown-it-py, httpcore, googleapis-common-protos, google-resumable-media, gitdb, fire, coloredlogs, botocore, aiosignal, xformers, tokenizers, starlette, rouge-score, rich, httpx, google-auth, GitPython, flash-attn, deepspeed, aiohttp, accelerate, wandb, transformers, gradio-client, google-auth-oauthlib, google-api-core, fastapi, altair, aiobotocore, tensorboard, s3fs, peft, gradio, google-cloud-core, fschat, datasets, bert-score, optimum, google-cloud-storage, evaluate, auto-gptq, gcsfs, axolotl\n",
+      "  Attempting uninstall: typing-extensions\n",
+      "    Found existing installation: typing_extensions 4.7.1\n",
+      "    Uninstalling typing_extensions-4.7.1:\n",
+      "      Successfully uninstalled typing_extensions-4.7.1\n",
+      "  Running setup.py develop for axolotl\n",
+      "Successfully installed GitPython-3.1.40 absl-py-2.0.0 accelerate-0.24.1 addict-2.4.0 aiobotocore-2.7.0 aiofiles-23.2.1 aiohttp-3.9.1 aioitertools-0.11.0 aiosignal-1.3.1 altair-5.2.0 appdirs-1.4.4 art-6.1 async-timeout-4.0.3 auto-gptq-0.5.1 axolotl-0.3.0 bert-score-0.3.13 bitsandbytes-0.41.3.post2 botocore-1.31.64 cachetools-5.3.2 colorama-0.4.6 coloredlogs-15.0.1 contourpy-1.2.0 cycler-0.12.1 datasets-2.16.0 deepspeed-0.12.6 dill-0.3.7 docker-pycreds-0.4.0 einops-0.7.0 evaluate-0.4.0 fastapi-0.108.0 ffmpy-0.3.1 fire-0.5.0 flash-attn-2.3.3 fonttools-4.47.0 frozenlist-1.4.1 fschat-0.2.34 gcsfs-2023.10.0 gekko-1.0.6 gitdb-4.0.11 google-api-core-2.15.0 google-auth-2.25.2 google-auth-oauthlib-1.2.0 google-cloud-core-2.4.1 google-cloud-storage-2.14.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 googleapis-common-protos-1.62.0 gradio-3.50.2 gradio-client-0.6.1 grpcio-1.60.0 h11-0.14.0 hf_transfer-0.1.4 hjson-3.1.0 httpcore-1.0.2 httpx-0.26.0 humanfriendly-10.0 importlib-resources-6.1.1 jmespath-1.0.1 joblib-1.3.2 kiwisolver-1.4.5 llvmlite-0.41.1 markdown-3.5.1 markdown-it-py-3.0.0 markdown2-2.4.12 matplotlib-3.8.2 mdurl-0.1.2 multidict-6.0.4 multiprocess-0.70.15 nh3-0.2.15 ninja-1.11.1.1 nltk-3.8.1 numba-0.58.1 oauthlib-3.2.2 optimum-1.13.2 orjson-3.9.10 pandas-2.1.4 peft-0.6.0 protobuf-4.23.4 py-cpuinfo-9.0.0 pyarrow-14.0.2 pyarrow-hotfix-0.6 pyasn1-0.5.1 pyasn1-modules-0.3.0 pydantic-1.10.13 pydub-0.25.1 pynvml-11.5.0 pyparsing-3.1.1 python-multipart-0.0.6 regex-2023.12.25 requests-oauthlib-1.3.1 responses-0.18.0 rich-13.7.0 rouge-1.0.1 rouge-score-0.1.2 rsa-4.9 s3fs-2023.10.0 safetensors-0.4.1 scikit-learn-1.2.2 scipy-1.11.4 semantic-version-2.10.0 sentencepiece-0.1.99 sentry-sdk-1.39.1 setproctitle-1.3.3 shortuuid-1.0.11 smmap-5.0.1 starlette-0.32.0.post1 svgwrite-1.4.3 tensorboard-2.15.1 tensorboard-data-server-0.7.2 termcolor-2.4.0 threadpoolctl-3.2.0 tiktoken-0.5.2 tokenizers-0.15.0 transformers-4.36.2 typing-extensions-4.8.0 tzdata-2023.3 
uvicorn-0.25.0 wandb-0.16.1 wavedrom-2.0.3.post3 websockets-11.0.3 werkzeug-3.0.1 wrapt-1.16.0 xformers-0.0.23 xxhash-3.4.1 yarl-1.9.4\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0mCollecting git+https://github.com/huggingface/peft.git\n",
+      "  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-hka8xgk2\n",
+      "  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-hka8xgk2\n",
+      "  Resolved https://github.com/huggingface/peft.git to commit cf04d0353f0343cbf66627228c4495f51669af34\n",
+      "  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (1.26.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (23.1)\n",
+      "Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (5.9.0)\n",
+      "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (6.0.1)\n",
+      "Requirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (2.1.1)\n",
+      "Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.36.2)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.65.0)\n",
+      "Requirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.24.1)\n",
+      "Requirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.4.1)\n",
+      "Requirement already satisfied: huggingface-hub>=0.17.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.20.1)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.10.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.31.0)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (4.8.0)\n",
+      "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (1.11.1)\n",
+      "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1)\n",
+      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1.2)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (2023.12.25)\n",
+      "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (0.15.0)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft==0.7.2.dev0) (2.1.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.7.22)\n",
+      "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft==0.7.2.dev0) (1.3.0)\n",
+      "Building wheels for collected packages: peft\n",
+      "  Building wheel for peft (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for peft: filename=peft-0.7.2.dev0-py3-none-any.whl size=169456 sha256=4c70d23e759fa6abb3827fb2f3a8683be3b24d78777d0f403bbc2c0548e5dd4b\n",
+      "  Stored in directory: /tmp/pip-ephem-wheel-cache-my5ncou6/wheels/d7/c7/de/1368fac8590e1b103ddc2ec2a28ad51d83aded1a3830e8a087\n",
+      "Successfully built peft\n",
+      "Installing collected packages: peft\n",
+      "  Attempting uninstall: peft\n",
+      "    Found existing installation: peft 0.6.0\n",
+      "    Uninstalling peft-0.6.0:\n",
+      "      Successfully uninstalled peft-0.6.0\n",
+      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+      "axolotl 0.3.0 requires peft==0.6.0, but you have peft 0.7.2.dev0 which is incompatible.\u001b[0m\u001b[31m\n",
+      "\u001b[0mSuccessfully installed peft-0.7.2.dev0\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "#installing what is needed from inside the axolotl folder\n",
+    "!pip install packaging\n",
+    "!pip install -e '.[flash-attn,deepspeed]'\n",
+    "!pip install -U git+https://github.com/huggingface/peft.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "82d1a380-1e87-48fe-89fe-25331326014d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The following values were not passed to `accelerate launch` and had defaults used instead:\n",
+      "\t`--num_processes` was set to a value of `3`\n",
+      "\t\tMore than one GPU was found, enabling multi-GPU training.\n",
+      "\t\tIf this was unintended please pass in `--num_processes=1`.\n",
+      "\t`--num_machines` was set to a value of `1`\n",
+      "\t`--mixed_precision` was set to a value of `'no'`\n",
+      "\t`--dynamo_backend` was set to a value of `'no'`\n",
+      "To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "[2023-12-28 15:44:09,979] [INFO] [datasets.<module>:58] [PID:2814] PyTorch version 2.1.1 available.\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "[2023-12-28 15:44:10,011] [INFO] [datasets.<module>:58] [PID:2812] PyTorch version 2.1.1 available.\n",
+      "[2023-12-28 15:44:10,013] [INFO] [datasets.<module>:58] [PID:2813] PyTorch version 2.1.1 available.\n",
+      "[2023-12-28 15:44:10,805] [INFO] [axolotl.normalize_config:150] [PID:2814] [RANK:2] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,830] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,842] [INFO] [axolotl.normalize_config:150] [PID:2813] [RANK:1] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,865] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,869] [INFO] [axolotl.normalize_config:150] [PID:2812] [RANK:0] GPU memory usage baseline: 0.000GB (+0.351GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,887] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,961] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:10,994] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:11,015] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:11,015] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
+      "                                 dP            dP   dP \n",
+      "                                 88            88   88 \n",
+      "      .d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88 \n",
+      "      88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88 \n",
+      "      88.  .88  .d88b.  88.  .88 88 88.  .88   88   88 \n",
+      "      `88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP \n",
+      "                                                       \n",
+      "                                                       \n",
+      "\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:11,413] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2812] [RANK:0] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:11,415] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2812] [RANK:0] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:11,530] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2813] [RANK:1] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2814] [RANK:2] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:12,160] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2813] [RANK:1] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:12,161] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2814] [RANK:2] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:12,236] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_tokens: 28120\u001b[39m\n",
+      "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] `total_supervised_tokens: 7990`\u001b[39m\n",
+      "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_steps: 6\u001b[39m\n",
+      "[2023-12-28 15:44:12,242] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading tokenizer... mistralai/Mistral-7B-v0.1\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading model and peft_config...\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:13,049] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 7.24B\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00,  5.81s/it]\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00,  5.98s/it]\n",
+      "[2023-12-28 15:44:25,395] [INFO] [axolotl.load_model:503] [PID:2813] [RANK:1] GPU memory usage after model load: 7.576GB (+0.524GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,399] [INFO] [axolotl.load_model:526] [PID:2813] [RANK:1] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:25,403] [INFO] [axolotl.load_model:538] [PID:2813] [RANK:1] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:25,480] [INFO] [axolotl.load_model:568] [PID:2813] [RANK:1] GPU memory usage after adapters: 7.589GB (+1.501GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,572] [INFO] [axolotl.load_model:503] [PID:2814] [RANK:2] GPU memory usage after model load: 7.576GB (+0.410GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,576] [INFO] [axolotl.load_model:526] [PID:2814] [RANK:2] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:25,580] [INFO] [axolotl.load_model:538] [PID:2814] [RANK:2] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:25,660] [INFO] [axolotl.load_model:568] [PID:2814] [RANK:2] GPU memory usage after adapters: 7.589GB (+1.388GB cache, +0.708GB misc)\u001b[39m\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:12<00:00,  6.30s/it]\n",
+      "[2023-12-28 15:44:26,170] [INFO] [axolotl.load_model:503] [PID:2812] [RANK:0] GPU memory usage after model load: 7.576GB (+0.776GB cache, +0.741GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:26,177] [INFO] [axolotl.load_model:526] [PID:2812] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:26,181] [INFO] [axolotl.load_model:538] [PID:2812] [RANK:0] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:26,259] [INFO] [axolotl.load_model:568] [PID:2812] [RANK:0] GPU memory usage after adapters: 7.589GB (+1.753GB cache, +0.741GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:26,293] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Pre-saving adapter config to ./out\u001b[39m\n",
+      "[2023-12-28 15:44:26,296] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Starting trainer...\u001b[39m\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Detected CUDA files, patching ldflags\n",
+      "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n",
+      "Building extension module fused_adam...\n",
+      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
+      "ninja: no work to do.\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.05891108512878418 seconds\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.10173463821411133 seconds\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.10152459144592285 seconds\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "Parameter Offload: Total persistent parameters: 3674112 in 193 params\n",
+      "  0%|                                                    | 0/17 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "{'loss': 2.0448, 'learning_rate': 2e-05, 'epoch': 0.06}                         \n",
+      "  6%|██▌                                         | 1/17 [00:28<07:32, 28.30s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.85s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.9694719314575195, 'eval_runtime': 11.391, 'eval_samples_per_second': 1.492, 'eval_steps_per_second': 0.263, 'epoch': 0.06}\n",
+      "  6%|██▌                                         | 1/17 [00:39<07:32, 28.30s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.65s/it]\u001b[A\n",
+      "                                                                                \u001b[A[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2812] [RANK:0] GPU memory usage while training: 12.210GB (+4.259GB cache, +0.776GB misc)\u001b[39m\n",
+      " 12%|█████▏                                      | 2/17 [01:04<08:18, 33.20s/it][2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2814] [RANK:2] GPU memory usage while training: 12.269GB (+4.522GB cache, +0.743GB misc)\u001b[39m\n",
+      "[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2813] [RANK:1] GPU memory usage while training: 12.283GB (+4.493GB cache, +0.743GB misc)\u001b[39m\n",
+      "{'loss': 2.0022, 'learning_rate': 4e-05, 'epoch': 0.12}                         \n",
+      "{'loss': 2.1054, 'learning_rate': 6e-05, 'epoch': 0.17}                         \n",
+      "{'loss': 1.9004, 'learning_rate': 8e-05, 'epoch': 0.23}                         \n",
+      "{'loss': 1.8794, 'learning_rate': 0.0001, 'epoch': 0.29}                        \n",
+      " 29%|████████████▉                               | 5/17 [02:20<05:23, 26.92s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.88s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.7912336587905884, 'eval_runtime': 11.3106, 'eval_samples_per_second': 1.503, 'eval_steps_per_second': 0.265, 'epoch': 0.29}\n",
+      " 29%|████████████▉                               | 5/17 [02:32<05:23, 26.92s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.67s/it]\u001b[A\n",
+      "{'loss': 1.7871, 'learning_rate': 0.00012, 'epoch': 0.35}                       \u001b[A\n",
+      "{'loss': 1.7758, 'learning_rate': 0.00014, 'epoch': 0.4}                        \n",
+      "{'loss': 1.4645, 'learning_rate': 0.00016, 'epoch': 0.46}                       \n",
+      "{'loss': 1.4009, 'learning_rate': 0.00018, 'epoch': 0.52}                       \n",
+      "{'loss': 1.3927, 'learning_rate': 0.0002, 'epoch': 0.58}                        \n",
+      " 59%|█████████████████████████▎                 | 10/17 [04:38<03:04, 26.33s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.89s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.1426481008529663, 'eval_runtime': 11.3344, 'eval_samples_per_second': 1.5, 'eval_steps_per_second': 0.265, 'epoch': 0.58}\n",
+      " 59%|█████████████████████████▎                 | 10/17 [04:49<03:04, 26.33s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.68s/it]\u001b[A\n",
+      "{'loss': 1.0122, 'learning_rate': 0.0001900968867902419, 'epoch': 0.63}         \u001b[A\n",
+      "{'loss': 1.0019, 'learning_rate': 0.00016234898018587337, 'epoch': 0.69}        \n",
+      "{'loss': 0.8976, 'learning_rate': 0.00012225209339563145, 'epoch': 0.75}        \n",
+      "{'loss': 0.9301, 'learning_rate': 7.774790660436858e-05, 'epoch': 0.81}         \n",
+      "{'loss': 0.8595, 'learning_rate': 3.7651019814126654e-05, 'epoch': 0.87}        \n",
+      " 88%|█████████████████████████████████████▉     | 15/17 [06:55<00:52, 26.17s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.88s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 0.8175248503684998, 'eval_runtime': 11.2932, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.266, 'epoch': 0.87}\n",
+      " 88%|█████████████████████████████████████▉     | 15/17 [07:06<00:52, 26.17s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.67s/it]\u001b[A\n",
+      "{'loss': 0.7931, 'learning_rate': 9.903113209758096e-06, 'epoch': 0.92}         \u001b[A\n",
+      "{'loss': 0.6909, 'learning_rate': 0.0, 'epoch': 0.98}                           \n",
+      "100%|███████████████████████████████████████████| 17/17 [07:56<00:00, 28.03s/it]/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "{'train_runtime': 489.0649, 'train_samples_per_second': 0.63, 'train_steps_per_second': 0.035, 'train_loss': 1.408153467318591, 'epoch': 0.98}\n",
+      "100%|███████████████████████████████████████████| 17/17 [08:09<00:00, 28.77s/it]\n",
+      "[2023-12-28 15:52:39,488] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Training Completed!!! Saving pre-trained model to ./out\u001b[39m\n",
+      "\u001b[0m\u001b[0m\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "Training with the config.yml file and the DeepSpeed zero3_bf16 config. ZeRO stage 3 is the most aggressive of the zero1, zero2 and zero3 stages: it partitions\n",
+    "not only optimizer states but also gradients and parameters across GPUs. The bf16 suffix indicates mixed-precision training using bfloat16.\n",
+    "For more information read axolotl's readme\n",
+    "\"\"\"\n",
+    "!accelerate launch -m axolotl.cli.train /folder/config.yml --deepspeed deepspeed_configs/zero3_bf16.json"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/examples/mistral/Mistral-7b-example/config.yml
+++ b/examples/mistral/Mistral-7b-example/config.yml
@@ -1,3 +1,4 @@
+#Mistral-7b
 base_model: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
@@ -7,32 +8,26 @@ load_in_4bit: false
 strict: false

 datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-output_dir: ./lora-out
+  - path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
+    #For the type and conversation arguments, read the axolotl README and pick what suits your project; I wanted a chatbot, so I used sharegpt and chatml
+    type: sharegpt
+    conversation: chatml
+dataset_prepared_path: tilemachos/Demo-Dataset #Local directory where the prepared (tokenized) dataset is saved and reloaded from
+val_set_size: 0.05
+output_dir: ./out

+#using lora for lower cost
 adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
+lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
 lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
  - q_proj
  - v_proj
-  - k_proj
-  - o_proj
+
+sequence_len: 512
+sample_packing: false
+pad_to_sequence_len: true

 wandb_project:
 wandb_entity:
@@ -40,17 +35,18 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

-gradient_accumulation_steps: 4
+#only 2 epochs because of small dataset
+gradient_accumulation_steps: 3
 micro_batch_size: 2
-num_epochs: 1
+num_epochs: 2
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: auto
-fp16:
+bf16: true
+fp16: false
 tf32: false

 gradient_checkpointing: true
@@ -61,17 +57,18 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true

-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-deepspeed:
+#default deepspeed config; a more aggressive stage such as zero2 or zero3 can be used if needed
+deepspeed: deepspeed_configs/zero1.json
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/mistral/Mistral-7b-example/data.jsonl
+++ b/examples/mistral/Mistral-7b-example/data.jsonl
@@ -0,0 +1,10 @@
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who is the Founder of Apple\""}, {"from": "gpt", "value": "\"<Chatbot>: The founder of Apple is Steve Jobs\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the capital of France?\""}, {"from": "gpt", "value": "\"<Chatbot>: The capital of France is Paris.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: How far is the Moon from Earth?\""}, {"from": "gpt", "value": "\"<Chatbot>: The Moon is approximately 384,400 kilometers from Earth.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the tallest mountain in the world?\""}, {"from": "gpt", "value": "\"<Chatbot>: The tallest mountain in the world is Mount Everest.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who wrote Romeo and Juliet?\""}, {"from": "gpt", "value": "\"<Chatbot>: Romeo and Juliet was written by William Shakespeare.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the boiling point of water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The boiling point of water is 100 degrees Celsius.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: When was the first man on the moon?\""}, {"from": "gpt", "value": "\"<Chatbot>: The first man landed on the moon in 1969.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the largest ocean?\""}, {"from": "gpt", "value": "\"<Chatbot>: The largest ocean is the Pacific Ocean.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who invented the telephone?\""}, {"from": "gpt", "value": "\"<Chatbot>: The telephone was invented by Alexander Graham Bell.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the formula for water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The chemical formula for water is H2O.\""}]}
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -56,3 +56,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/mistral/mixtral_fused.py
+++ b/examples/mistral/mixtral_fused.py
@@ -0,0 +1,75 @@
+import gc
+import torch
+from tqdm import tqdm
+from axolotl.monkeypatch.moe.moe import SparseMoeBlock
+from transformers import AutoTokenizer, TextStreamer
+from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock, MixtralForCausalLM, MixtralConfig
+
+def compute_memory_used_pct(device):  # peak allocated CUDA memory on `device` as a percentage of the device's total memory
+    memory_used = torch.cuda.max_memory_allocated(device) / (1024**3)  # scale the peak allocation by 1024**3 (bytes -> GiB)
+    memory_pct = (
+        memory_used
+        / (torch.cuda.get_device_properties(device).total_memory / (1024**3))  # device capacity, scaled the same way
+        * 100
+    )
+    return memory_pct
+
+model_path = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+
+# Load the Mixtral model in fp16 with device_map="auto", a 2048-position config and use_cache disabled
+config = MixtralConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False)
+model = MixtralForCausalLM.from_pretrained(
+    model_path,
+    config=config,
+    device_map="auto",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float16,
+)
+modules = {k:v for k,v in model.named_modules() if isinstance(v, MixtralSparseMoeBlock)}  # every MixtralSparseMoeBlock in the model, keyed by module name
+
+for device_index in range(torch.cuda.device_count()):  # report per-GPU memory usage before the conversion
+    device_memory_pct = compute_memory_used_pct(device_index)
+    print(device_index, device_memory_pct)
+
+with tqdm(modules.items(), desc="scatter moe") as pbar:
+    for i, (name, module) in enumerate(pbar):  # i is used as the decoder-layer index below; assumes one MoE block per layer, enumerated in layer order - TODO confirm
+        smoe = SparseMoeBlock(  # new block built from the existing experts and gate of the original module
+            experts=module.experts,
+            gate=module.gate,
+            hidden_dim=module.hidden_dim,
+            ffn_dim=module.ffn_dim,
+            num_experts=module.num_experts,
+            top_k=module.top_k,
+        )
+        old_module = model.model.layers[i].block_sparse_moe
+        setattr(model.model.layers[i], "block_sparse_moe", smoe)  # swap the new block into layer i
+        del old_module  # NOTE(review): removes only this local name; the `modules` dict still references the old block
+        torch.cuda.empty_cache()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+        for device_index in range(torch.cuda.device_count()):  # report per-GPU memory usage after each swap
+            device_memory_pct = compute_memory_used_pct(device_index)
+            print(device_index, device_memory_pct)
+
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+# Format the prompt with the [INST] template, tokenize it and move the input ids to CUDA
+prompt_template = "[INST] {prompt} [/INST]"
+
+prompt = "You're standing on the surface of the Earth. "\
+        "You walk one mile south, one mile west and one mile north. "\
+        "You end up exactly where you started. Where are you?"
+
+tokens = tokenizer(
+    prompt_template.format(prompt=prompt), 
+    return_tensors='pt'
+).input_ids.cuda()
+
+# Generate up to 512 new tokens, passing the TextStreamer so output is streamed during generation
+generation_output = model.generate(
+    tokens, 
+    streamer=streamer,
+    max_new_tokens=512
+)
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -75,3 +75,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/qwen/README.md
+++ b/examples/qwen/README.md
@@ -1,10 +0,0 @@
-# Qwen
-
-TODO
-
-# Qwen2 MoE
-
-✅ multipack
-✅ qwen2_moe 4-bit QLoRA
-✅ qwen2_moe 16-bit LoRA
-❓ qwen2_moe 8-bit LoRA
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -1,64 +0,0 @@
-base_model: Qwen/Qwen1.5-MoE-A2.7B
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./out
-
-sequence_len: 1024  # supports up to 32k
-sample_packing: false
-pad_to_sequence_len: false
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -1,64 +0,0 @@
-base_model: Qwen/Qwen1.5-MoE-A2.7B
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./out
-
-sequence_len: 1024  # supports up to 32k
-sample_packing: false
-pad_to_sequence_len: false
-
-adapter: qlora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/favicon.jpg
+++ b/favicon.jpg
--- a/index.qmd
+++ b/index.qmd
@@ -1,23 +0,0 @@
---
-toc-location: right-body
-toc-title: Table Of Contents
-toc-expand: 2
---
-
-```{python}
-#|output: asis
-#|echo: false
-
-# This cell steals the README as the home page for now, but excludes the table of contents (quarto adds its own)
-import re
-pattern = re.compile(
-    r"<table>\s*<tr>\s*<td>\s*## Table of Contents.*?</td>\s*</tr>\s*</table>",
-    re.DOTALL | re.IGNORECASE
-)
-
-with open('README.md', 'r') as f:
-    txt = f.read()
-
-cleaned = pattern.sub("", txt)
-print(cleaned)
-```
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,10 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.10.0
-transformers @ git+https://github.com/huggingface/transformers.git@43d17c18360ac9c3d3491389328e2fe55fe8f9ce
+peft==0.9.0
+transformers==4.38.2
 tokenizers==0.15.0
-bitsandbytes==0.43.0
-accelerate==0.28.0
+bitsandbytes>=0.43.0
+accelerate==0.26.1
 deepspeed==0.13.1
 pydantic==2.6.3
 addict
@@ -32,12 +32,12 @@ fschat==0.2.36
 gradio==3.50.2
 tensorboard

-mamba-ssm==1.2.0.post1
+mamba-ssm==1.1.1

 # remote filesystems
 s3fs
 gcsfs
 # adlfs

-trl @ git+https://github.com/huggingface/trl.git@0ee349dcd43b0f4b3169449f16751c38ac4a609f
-zstandard==0.22.0
+trl>=0.7.9
+fastcore>=1.5.29
--- a/setup.py
+++ b/setup.py
@@ -78,7 +78,7 @@ setup(
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
-            "mamba-ssm==1.2.0.post1",
+            "mamba-ssm==1.0.1",
        ],
        "auto-gptq": [
            "auto-gptq==0.5.1",
@@ -89,8 +89,5 @@ setup(
        "lion-pytorch": [
            "lion-pytorch==0.1.2",
        ],
-        "galore": [
-            "galore_torch",
-        ],
    },
 )
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -24,7 +24,6 @@ from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
 from transformers.utils import is_torch_bf16_gpu_available
-from transformers.utils.import_utils import _is_package_available

 from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
 from axolotl.logging_config import configure_logging
@@ -63,20 +62,6 @@ def print_axolotl_text_art(suffix=None):
    if is_main_process():
        print(ascii_art)

-    print_dep_versions()
-
-
-def print_dep_versions():
-    packages = ["accelerate", "peft", "transformers", "trl", "torch", "bitsandbytes"]
-    max_len = max(len(pkg) for pkg in packages)
-    if is_main_process():
-        print("*" * 40)
-        print("**** Axolotl Dependency Versions *****")
-        for pkg in packages:
-            version = _is_package_available(pkg, return_version=True)
-            print(f"{pkg: >{max_len}}: {version[1]: <15}")
-        print("*" * 40)
-

 def check_remote_config(config: Union[str, Path]):
    # Check if the config is a valid HTTPS URL to a .yml or .yaml file
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -38,8 +38,6 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
    parsed_cfg.load_in_4bit = False
    parsed_cfg.load_in_8bit = False
    parsed_cfg.flash_attention = False
-    parsed_cfg.deepspeed = None
-    parsed_cfg.fsdp = None

    do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)

--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -54,7 +54,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        LOG.warning(msg)
        parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

-    if parsed_cfg.rl and parsed_cfg.rl != "orpo":
+    if parsed_cfg.rl:
        load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
    else:
        load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -47,7 +47,7 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    else:
        register_chatml_template()

-    if cfg.rl and cfg.rl != "orpo":
+    if cfg.rl:
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/core/policies/init.py
+++ b/src/axolotl/core/policies/init.py
--- a/src/axolotl/core/policies/auto_wrap.py
+++ b/src/axolotl/core/policies/auto_wrap.py
@@ -0,0 +1,75 @@
+"""module for building the auto wrap policy for FSDP"""
+import functools
+
+from peft import PrefixEncoder, PromptEmbedding, PromptEncoder
+from torch.distributed.fsdp.wrap import (
+    _or_policy,
+    lambda_auto_wrap_policy,
+    transformer_auto_wrap_policy,
+)
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
+from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer
+
+SUPPORTED_AUTO_WRAP_MODEL_TYPES = [
+    "llama",
+    "mistral",
+    "mixtral",
+]
+
+
+def get_wrapping_policy_factory(model_type):
+    """
+    Build a factory for the FSDP auto wrap policy of the given model type.
+
+    Args:
+        model_type: one of SUPPORTED_AUTO_WRAP_MODEL_TYPES.
+
+    Returns:
+        A zero-argument callable that returns the combined auto wrap policy.
+
+    Raises:
+        ValueError: if ``model_type`` has no known decoder layer class.
+    """
+    if model_type == "llama":
+        layer_to_wrap = LlamaDecoderLayer
+    elif model_type == "mistral":
+        layer_to_wrap = MistralDecoderLayer
+    elif model_type == "mixtral":
+        layer_to_wrap = MixtralDecoderLayer
+    else:
+        # fail here with a clear message instead of a NameError on the
+        # unassigned layer_to_wrap when the policy is built later
+        raise ValueError(
+            f"unsupported model_type for FSDP auto wrap: {model_type!r}; "
+            f"expected one of {SUPPORTED_AUTO_WRAP_MODEL_TYPES}"
+        )
+
+    def get_wrapping_policy():
+        """This checks for lora layers (has weight and requires_grad)"""
+
+        def lambda_policy_fn(module):
+            # selects modules with no children that own a trainable weight
+            return (
+                len(list(module.named_children())) == 0
+                and getattr(module, "weight", None) is not None
+                and module.weight.requires_grad
+            )
+
+        lambda_policy = functools.partial(
+            lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn
+        )
+        transformer_wrap_policy = functools.partial(
+            transformer_auto_wrap_policy,
+            transformer_layer_cls=(
+                PrefixEncoder,
+                PromptEncoder,
+                PromptEmbedding,
+                layer_to_wrap,
+            ),
+        )
+        # hand both policies to _or_policy as a single combined policy
+        policies = [lambda_policy, transformer_wrap_policy]
+        return functools.partial(_or_policy, policies=policies)
+
+    return get_wrapping_policy
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -8,22 +8,24 @@ import importlib
 import importlib.util
 import logging
 import math
+import os
 import sys
 from abc import abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from functools import wraps
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Type, Union
+from typing import List, Optional, Type, Union

 import torch
 import transformers
+from accelerate import FullyShardedDataParallelPlugin
+from accelerate.utils import str_to_bool
 from datasets import Dataset
+from torch.distributed.fsdp import MixedPrecision
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
    EarlyStoppingCallback,
-    PreTrainedModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
@@ -31,12 +33,11 @@ from transformers import (
 from transformers.trainer_utils import seed_worker
 from transformers.utils import is_sagemaker_mp_enabled
 from trl import DPOTrainer
-from trl.trainer.utils import pad_to_length

+from axolotl.core.policies.auto_wrap import get_wrapping_policy_factory
 from axolotl.loraplus import create_loraplus_optimizer
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
-from axolotl.utils import is_mlflow_available
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
@@ -47,7 +48,6 @@ from axolotl.utils.callbacks import (
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
 )
-from axolotl.utils.callbacks.lisa import lisa_callback_factory
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
@@ -72,6 +72,10 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")


+def is_mlflow_available():  # True when importlib can find a spec for the `mlflow` package
+    return importlib.util.find_spec("mlflow") is not None
+
+
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    if isinstance(tag_names, str):
        tag_names = [tag_names]
@@ -196,21 +200,6 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
-    orpo_alpha: Optional[float] = field(
-        default=None,
-    )
-    lisa_n_layers: Optional[int] = field(
-        default=None,
-        metadata={"help": "the number of activate layers in LISA"},
-    )
-    lisa_step_interval: Optional[int] = field(
-        default=None,
-        metadata={"help": "how often to switch layers in LISA"},
-    )
-    lisa_layers_attribute: Optional[str] = field(
-        default=None,
-        metadata={"help": "path under the model to access the layers"},
-    )


 class AxolotlTrainer(Trainer):
@@ -227,16 +216,13 @@ class AxolotlTrainer(Trainer):
        num_epochs=1,
        bench_data_collator=None,
        eval_data_collator=None,
-        **kwargs,
+        **kwargs
    ):
        self.num_epochs = num_epochs
        self.bench_data_collator = bench_data_collator
        self.eval_data_collator = eval_data_collator
        super().__init__(*_args, **kwargs)
        self.train_data_collator = self.data_collator
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
-        if self.args.orpo_alpha:
-            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

    def create_optimizer(self):
        if self.args.loraplus_lr_ratio is None:
@@ -246,7 +232,6 @@ class AxolotlTrainer(Trainer):
        if self.optimizer is None:  # pylint: disable=access-member-before-definition
            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(
                self.args,
-                opt_model,
            )

            loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
@@ -480,165 +465,8 @@ class AxolotlTrainer(Trainer):
        #     outputs = model(**inputs)
        #     loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
        #     return (loss, outputs) if return_outputs else loss
-        if self.args.orpo_alpha:
-            return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs)
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

-    @staticmethod
-    def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
-        concatenated_batch = {}
-
-        max_length = max(
-            inputs["input_ids"].shape[1], inputs["rejected_input_ids"].shape[1]
-        )
-        # Concatenate positive and negative inputs
-        concatenated_batch["input_ids"] = pad_to_length(
-            inputs["input_ids"], max_length, pad_token
-        )
-        concatenated_batch["rejected_input_ids"] = pad_to_length(
-            inputs["rejected_input_ids"], max_length, pad_token
-        )
-        concatenated_batch["labels"] = pad_to_length(
-            inputs["labels"], max_length, label_pad_token
-        )
-        concatenated_batch["rejected_labels"] = pad_to_length(
-            inputs["rejected_labels"], max_length, label_pad_token
-        )
-        concatenated_batch["attention_mask"] = pad_to_length(
-            inputs["attention_mask"], max_length, 0
-        )
-        concatenated_batch["rejected_attention_mask"] = pad_to_length(
-            inputs["rejected_attention_mask"], max_length, 0
-        )
-        concatenated_batch["prompt_attention_mask"] = pad_to_length(
-            inputs["prompt_attention_mask"], max_length, 0
-        ).to(device=device)
-
-        input_ids = torch.cat(
-            [concatenated_batch["input_ids"], concatenated_batch["rejected_input_ids"]],
-            dim=0,
-        ).to(device=device)
-        attention_mask = torch.cat(
-            [
-                concatenated_batch["attention_mask"],
-                concatenated_batch["rejected_attention_mask"],
-            ],
-            dim=0,
-        ).to(device=device)
-        labels = torch.cat(
-            [concatenated_batch["labels"], concatenated_batch["rejected_labels"]], dim=0
-        ).to(device=device)
-
-        return {
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": attention_mask,
-            "prompt_attention_mask": concatenated_batch["prompt_attention_mask"],
-        }
-
-    def orpo_compute_custom_loss(self, logits, labels):
-        logits = logits.contiguous()
-        loss = 0.0
-
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-
-            # Flatten the tokens
-            loss = self.loss_fct(shift_logits.transpose(2, 1), shift_labels).mean(
-                dim=-1
-            )
-
-        return loss
-
-    def orpo_compute_logps(
-        self, prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits
-    ):
-        # Get the shape of chosen_attention_mask[:, :-1]
-        chosen_shape = chosen_attention_mask[:, :-1].shape
-
-        # Calculate the padding size
-        pad_length = chosen_shape[1] - (prompt_attention_mask.shape[1] - 1)
-
-        # Pad prompt_attention_mask with zeros to match the desired shape
-        prompt_attention_mask_padded = torch.nn.functional.pad(
-            prompt_attention_mask[:, 1:], (0, pad_length), mode="constant", value=0
-        )
-
-        # Perform the subtraction operation
-        mask = chosen_attention_mask[:, :-1] > prompt_attention_mask_padded
-
-        per_token_logps = torch.gather(
-            logits[:, :-1, :].log_softmax(-1),
-            dim=2,
-            index=(mask * chosen_inputs[:, 1:]).unsqueeze(2),
-        ).squeeze(2)
-        return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1)
-
-    def orpo_compute_loss(self, model, inputs, return_outputs=False):
-        concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
-            inputs,
-            label_pad_token=-100,
-            pad_token=self.tokenizer.pad_token_id,
-            device=self.accelerator.device,
-        )
-
-        # Perform a single forward pass
-        outputs = model(
-            **{
-                "input_ids": concat_inputs["input_ids"],
-                "attention_mask": concat_inputs["attention_mask"],
-                "labels": concat_inputs["labels"],
-            },
-            output_hidden_states=True,
-        )
-
-        # Split the outputs for positive and negative examples
-        outputs_pos, outputs_neg = outputs.logits.chunk(2)
-
-        # Calculate NLL loss
-        pos_loss = self.orpo_compute_custom_loss(
-            logits=outputs_pos, labels=concat_inputs["input_ids"].chunk(2)[0]
-        )
-
-        # Calculate Log Probability
-        pos_prob = self.orpo_compute_logps(
-            prompt_attention_mask=concat_inputs["prompt_attention_mask"],
-            chosen_inputs=concat_inputs["input_ids"].chunk(2)[0],
-            chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[0],
-            logits=outputs_pos,
-        )
-        neg_prob = self.orpo_compute_logps(
-            prompt_attention_mask=concat_inputs["prompt_attention_mask"],
-            chosen_inputs=concat_inputs["input_ids"].chunk(2)[1],
-            chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[1],
-            logits=outputs_neg,
-        )
-
-        # Calculate log odds
-        log_odds = (pos_prob - neg_prob) - (
-            torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob))
-        )
-        sig_ratio = torch.nn.functional.sigmoid(log_odds)
-        ratio = torch.log(sig_ratio)
-
-        # Calculate the Final Loss
-        loss = torch.mean(pos_loss - self.args.orpo_alpha * ratio).to(
-            dtype=torch.bfloat16
-        )
-
-        metrics = {}
-        metrics["chosen_geometric_mean"] = torch.mean(pos_prob).cpu().item()
-        metrics["rejected_geometric_mean"] = torch.mean(neg_prob).cpu().item()
-        metrics["log_odds_ratio"] = torch.mean(ratio).cpu().item()
-        metrics["log_odds"] = torch.mean(log_odds).cpu().item()
-        self.store_metrics(metrics, train_eval="train")
-
-        return (loss, outputs_pos) if return_outputs else loss
-
    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
@@ -651,39 +479,54 @@ class AxolotlTrainer(Trainer):

    @wraps(Trainer.create_accelerator_and_postprocess)
    def create_accelerator_and_postprocess(self):
+        rank = int(os.environ.get("LOCAL_RANK", 0))
        res = super().create_accelerator_and_postprocess()

+        if self.args.qlora is False:
+            return res
+
+        # the rest of this method override is specific to fsdp + qlora (for now)
+        sync_module_states = (
+            str_to_bool(os.environ.get("FSDP_SYNC_MODULE_STATES", "True")) == 1
+        )
+
+        mp_policy = None
+        amp = os.environ["ACCELERATE_MIXED_PRECISION"]
+        if amp == "fp16":
+            mp_policy = MixedPrecision(
+                param_dtype=torch.float32,
+                reduce_dtype=torch.float32,
+                buffer_dtype=torch.float32,
+            )
+        elif amp == "bf16":
+            mp_policy = MixedPrecision(
+                param_dtype=torch.float32,
+                reduce_dtype=torch.float32,
+                buffer_dtype=torch.float32,
+            )
+
+        # If somehow we figure out how we want to parameterize we want to autocast buffers...
+        # mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32)
+        # load_param_skip_names = ['inv_freq']
+
        if self.is_fsdp_enabled:
-            if (
-                "limit_all_gathers" in self.args.fsdp_config
-                and self.args.fsdp_config["limit_all_gathers"]
-            ):
-                self.accelerator.state.fsdp_plugin.limit_all_gathers = True
+            wrapping_policy = get_wrapping_policy_factory(self.args.model_type)
+            fsdp_plugin = FullyShardedDataParallelPlugin(
+                auto_wrap_policy=wrapping_policy(),
+                cpu_offload=False,
+                use_orig_params=False,
+                limit_all_gathers=True,
+                param_init_fn=lambda module: module.to_empty(
+                    device=torch.device("cuda"), recurse=False
+                )
+                if (rank != 0 and sync_module_states)
+                else None,
+                mixed_precision_policy=mp_policy,
+            )
+            self.accelerator.state.fsdp_plugin = fsdp_plugin

        return res

-    def log(self, logs: Dict[str, float]) -> None:
-        """
-        Log `logs` on the various objects watching training, including stored metrics.
-
-        Args:
-            logs (`Dict[str, float]`):
-                The values to log.
-        """
-        # logs either has 'loss' or 'eval_loss'
-        train_eval = "train" if "loss" in logs else "eval"
-        # Add averaged stored metrics to logs
-        for key, metrics in self._stored_metrics[train_eval].items():
-            logs[key] = torch.tensor(metrics).mean().item()
-        del self._stored_metrics[train_eval]
-        return super().log(logs)
-
-    def store_metrics(
-        self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
-    ) -> None:
-        for key, value in metrics.items():
-            self._stored_metrics[train_eval][key].append(value)
-

 class AxolotlMambaTrainer(AxolotlTrainer):
    """
@@ -800,15 +643,6 @@ class AxolotlDPOTrainer(DPOTrainer):

        return super().push_to_hub(*args, **kwargs)

-    def tokenize_row(
-        self, feature, model: Optional[Union[PreTrainedModel, torch.nn.Module]] = None
-    ) -> Dict:
-        res = super().tokenize_row(feature, model=model)
-        if self.tokenizer.bos_token_id is None and res["prompt_input_ids"][0] is None:
-            for key in res.keys():
-                res[key] = res[key][1:]
-        return res
-

 class TrainerBuilderBase(abc.ABC):
    """
@@ -825,12 +659,6 @@ class TrainerBuilderBase(abc.ABC):
        self.model = model
        self.tokenizer = tokenizer

-        # in case the model supports tagging, add the axolotl tag.
-        # This makes sure the tag is correctly pushed even if a user calls
-        # model.push_to_hub instad of  trainer.push_to_hub.
-        if hasattr(model, "add_model_tags"):
-            model.add_model_tags(["axolotl"])
-
    @property
    def model_ref(self):
        return self._model_ref
@@ -940,16 +768,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        callbacks = []
        if self.cfg.use_wandb and self.cfg.eval_table_size > 0:
            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer, "wandb"
-            )
-            callbacks.append(LogPredictionCallback(self.cfg))
-        if (
-            self.cfg.use_mlflow
-            and is_mlflow_available()
-            and self.cfg.eval_table_size > 0
-        ):
-            LogPredictionCallback = log_prediction_callback_factory(
-                trainer, self.tokenizer, "mlflow"
+                trainer, self.tokenizer
            )
            callbacks.append(LogPredictionCallback(self.cfg))

@@ -967,8 +786,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            )
            callbacks.append(early_stop_cb)

-        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
-            callbacks.append(lisa_callback_factory(trainer))
        return callbacks

    def _get_trainer_cls(self):
@@ -1020,6 +837,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                training_arguments_kwargs[
                    "gradient_checkpointing_kwargs"
                ] = self.cfg.gradient_checkpointing_kwargs
+            else:
+                training_arguments_kwargs["gradient_checkpointing_kwargs"] = {
+                    "use_reentrant": False
+                }
        if self.cfg.fsdp:
            training_arguments_kwargs["fsdp"] = self.cfg.fsdp
            if self.cfg.fsdp_config:
@@ -1058,9 +879,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.save_safetensors is not None:
            training_arguments_kwargs["save_safetensors"] = self.cfg.save_safetensors

-        if self.cfg.save_only_model is not None:
-            training_arguments_kwargs["save_only_model"] = self.cfg.save_only_model
-
        if self.cfg.sample_packing_eff_est:
            training_arguments_kwargs[
                "sample_packing_efficiency"
@@ -1085,11 +903,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True

-        if self.cfg.remove_unused_columns is not None:
-            training_arguments_kwargs[
-                "remove_unused_columns"
-            ] = self.cfg.remove_unused_columns
-
        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
            # no eval set, so don't eval
            training_arguments_kwargs["evaluation_strategy"] = "no"
@@ -1203,18 +1016,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
        )
-        if self.cfg.optim_args:
-            if isinstance(self.cfg.optim_args, dict):
-                optim_args = ",".join(
-                    [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
-                )
-            else:
-                optim_args = self.cfg.optim_args
-            training_arguments_kwargs["optim_args"] = optim_args
-        if self.cfg.optim_target_modules:
-            training_arguments_kwargs[
-                "optim_target_modules"
-            ] = self.cfg.optim_target_modules
        training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
        training_arguments_kwargs[
            "loraplus_lr_embedding"
@@ -1263,24 +1064,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                    "relora_prune_ratio"
                ] = self.cfg.relora_prune_ratio

-        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
-            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
-            training_arguments_kwargs[
-                "lisa_step_interval"
-            ] = self.cfg.lisa_step_interval
-            training_arguments_kwargs[
-                "lisa_layers_attribute"
-            ] = self.cfg.lisa_layers_attribute
-
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)

-        if self.cfg.rl == "orpo":
-            training_arguments_kwargs["orpo_alpha"] = self.cfg.orpo_alpha
-
        if self.cfg.neftune_noise_alpha is not None:
            training_arguments_kwargs[
                "neftune_noise_alpha"
@@ -1344,7 +1133,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            args=training_args,
-            tokenizer=self.tokenizer,
            data_collator=self.build_collator(training_args, **data_collator_kwargs),
            eval_data_collator=self.build_collator(
                training_args, is_eval=True, **data_collator_kwargs
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -284,7 +284,12 @@ def flashattn_forward_with_s2attn(
    # [bsz, nh, q_len, hd]
    # pylint: disable=duplicate-code

-    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(
+        value_states, seq_len=kv_seq_len, position_ids=position_ids
+    )
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
@@ -430,7 +435,13 @@ def flashattn_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

-    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+
+    cos, sin = self.rotary_emb(
+        value_states, seq_len=kv_seq_len, position_ids=position_ids
+    )
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -80,7 +80,11 @@ def xformers_forward(
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

-    cos, sin = self.rotary_emb(value_states)
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
--- a/src/axolotl/monkeypatch/moe/init.py
+++ b/src/axolotl/monkeypatch/moe/init.py
--- a/src/axolotl/monkeypatch/moe/linear.py
+++ b/src/axolotl/monkeypatch/moe/linear.py
@@ -0,0 +1,149 @@
+"""
+Adapted from:
+https://github.com/shawntan/scattermoe
+https://arxiv.org/abs/2403.08245
+"""
+
+import torch
+import torch.nn as nn
+from axolotl.monkeypatch.moe import ops
+
+class ParallelLinear(torch.autograd.Function):
+    """
+    Custom autograd function that runs a per-expert linear layer through the
+    `ops.scatter2scatter` kernel and, optionally, mixes the results with
+    routing gates.
+
+    `forward` takes ten inputs; `backward` returns ten gradients in the same
+    order, with `None` for everything except `x`, `expert_weights` and `gates`.
+    """
+    @staticmethod
+    def forward(
+        ctx, x, expert_weights, k,
+        sorted_expert_idxs, sorted_scattered_idxs,
+        padded_block_idxs, expert_offsets,
+        gates=None, grouped_in=False, grouped_out=False,
+    ):
+        """
+        Apply the expert weights to `x` and optionally combine with `gates`.
+
+        `k`, the index tensors, `grouped_in` and `grouped_out` are forwarded
+        unchanged to `ops.scatter2scatter`. When `gates` is given, the kernel
+        output is viewed as (gates.size(0), gates.size(1), features) and
+        reduced to (gates.size(0), features) by a batched matmul with the gates.
+        """
+
+        output = ops.scatter2scatter(
+            X=x, W=expert_weights,
+            sorted_expert_idxs=sorted_expert_idxs,
+            sorted_scattered_idxs=sorted_scattered_idxs,
+            padded_block_idxs=padded_block_idxs,
+            k=k, x_grouped=grouped_in, y_grouped=grouped_out
+        )
+        if gates is not None:
+            # Keep the un-mixed per-expert outputs: backward needs them for the gate gradient.
+            output_expanded = output.view(gates.size(0), gates.size(1), output.size(-1))
+            # (B, 1, G) @ (B, G, F) -> (B, 1, F) -> (B, F): gate-weighted sum over axis 1.
+            output = torch.bmm(
+                gates[:, None, :],
+                output_expanded
+            ).squeeze(1)
+        else:
+            output_expanded = None
+
+        ctx.save_for_backward(
+            x, expert_weights,
+            sorted_expert_idxs,
+            sorted_scattered_idxs,
+            padded_block_idxs, expert_offsets,
+            gates,
+            output_expanded
+        )
+        # Non-tensor arguments are stashed directly on ctx.
+        ctx.grouped_in = grouped_in
+        ctx.grouped_out = grouped_out
+        ctx.k = k
+        return output
+    @staticmethod
+    def backward(ctx, grad_out):
+        """
+        Compute gradients for `x`, `expert_weights` and (if used) `gates`.
+
+        Returns a 10-tuple aligned with the positional inputs of `forward`.
+        """
+        (x, expert_weights,
+         sorted_expert_idxs,
+         sorted_scattered_idxs,
+         padded_block_idxs, expert_offsets,
+         gates, output_expanded) = ctx.saved_tensors
+        k = ctx.k
+        grouped_in = ctx.grouped_in
+        grouped_out = ctx.grouped_out
+        # print("backward")
+        if gates is not None:
+            # calculate gates gradient
+            # (B, G, F) @ (B, F, 1) -> (B, G): dot of each un-mixed output with grad_out.
+            d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
+            gates_flat = gates.flatten()
+            gate_fan = gates.size(1)
+            # print("expanded and grouping")
+            # NOTE(review): passed below as the `out` buffer of ops.group, so the saved
+            # forward outputs are presumably overwritten in place — confirm this is intended.
+            grouped_grad_out = output_expanded.flatten(0, 1) # reuse expanded buffer later
+        else:
+            d_gates = None
+            gates_flat = None
+            gate_fan = 1
+            grouped_grad_out = None
+
+        if grouped_out:
+            # The incoming gradient is used as-is; any buffer chosen above is discarded.
+            grouped_grad_out = grad_out
+        else:
+            # Gather grad_out rows via sorted_scattered_idxs, scaling by the gates when present.
+            grouped_grad_out = ops.group(grad_out, sorted_scattered_idxs,
+                                                 fan_out=gate_fan, coeff=gates_flat,
+                                                 out=grouped_grad_out)
+        if grouped_in:
+            grouped_x = x
+            # No scratch buffer available: scatter2scatter below allocates its own output.
+            d_expanded_input = None
+        else:
+            grouped_x = ops.group(x, sorted_scattered_idxs, fan_out=k)
+            d_expanded_input = grouped_x
+        # Weight gradient; must run before grouped_x's storage is reused as output below.
+        d_weights = ops.group_bwd_W(
+            DY=grouped_grad_out, X=grouped_x,
+            expert_offsets=expert_offsets,
+            E=expert_weights.size(0)
+        )
+        # Input gradient: apply the expert weights with their last two axes swapped.
+        d_expanded_input = ops.scatter2scatter(
+            X=grouped_grad_out, x_grouped=True,
+            W=expert_weights.permute(0, 2, 1),
+            padded_block_idxs=padded_block_idxs,
+            sorted_expert_idxs=sorted_expert_idxs,
+            sorted_scattered_idxs=sorted_scattered_idxs,
+            k=1,
+            y_grouped=grouped_in,
+            out=d_expanded_input # Reuse grouped_x buffer
+        )
+
+        if k == 1:
+            d_input = d_expanded_input
+        else:
+            # Each input row was used k times; sum the k gradient rows back into one.
+            d_input = d_expanded_input.view(x.size(0), k, d_expanded_input.size(-1)).sum(-2)
+        # print("backward end.")
+        return (
+            # x, expert_weights, k,
+            d_input, d_weights, None,
+            # sorted_expert_idxs, sorted_scattered_idxs,
+            None, None,
+            # padded_block_idxs, expert_offsets,
+            None, None,
+            # gates
+            d_gates, None, None
+        )
+
+def parallel_linear(inputs, expert_weights, k,
+                    sorted_expert_idxs, sorted_scattered_idxs,
+                    padded_block_idxs, expert_offsets,
+                    gates=None):
+    """
+    Functional entry point for `ParallelLinear`.
+
+    Forwards every argument positionally to `ParallelLinear.apply`; the
+    `grouped_in` / `grouped_out` flags are not passed and keep their defaults.
+    """
+    forward_args = (
+        inputs, expert_weights, k,
+        sorted_expert_idxs, sorted_scattered_idxs,
+        padded_block_idxs, expert_offsets,
+        gates,
+    )
+    return ParallelLinear.apply(*forward_args)
+
+class ParallelExperts(nn.Module):
+    """
+    Holds the weights of `num_experts` linear layers in one stacked parameter
+    of shape (num_experts, output_size, input_size) and applies them through
+    `ParallelLinear`.
+
+    The weights are allocated with `torch.empty` and are NOT initialised here;
+    the caller must fill them (e.g. by copying existing expert weights).
+    """
+
+    def __init__(self, num_experts, input_size, output_size, device, dtype=None) -> None:
+        """
+        Args:
+            num_experts: number of expert layers stacked along dim 0.
+            input_size: input feature size of every expert.
+            output_size: output feature size of every expert.
+            device: device on which the weight tensor is allocated.
+            dtype: optional dtype of the weight tensor; `None` keeps the
+                previous behaviour of using torch's default dtype.
+        """
+        super().__init__()
+        self.weight = nn.Parameter(
+            torch.empty(num_experts, output_size, input_size, device=device, dtype=dtype)
+        )
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.output_size = output_size
+
+    def extra_repr(self):
+        """Return the size summary shown in the module's repr."""
+        return 'num_experts={}, input_size={}, output_size={}'.format(
+            self.num_experts, self.input_size, self.output_size)
+
+    def forward(self, inputs, k, sorted_expert_idxs, sorted_scattered_idxs,
+                padded_block_idxs, expert_offsets,
+                gates=None, grouped_in=False, grouped_out=False):
+        """
+        Apply the stacked expert weights to `inputs` via `ParallelLinear`.
+
+        The weight is passed with its last two axes swapped, i.e. as
+        (num_experts, input_size, output_size); all other arguments are
+        forwarded unchanged.
+        """
+        results = ParallelLinear.apply(
+            inputs, self.weight.permute(0, 2, 1), k,
+            sorted_expert_idxs, sorted_scattered_idxs,
+            padded_block_idxs, expert_offsets,
+            gates, grouped_in, grouped_out
+        )
+        return results
--- a/src/axolotl/monkeypatch/moe/mlp.py
+++ b/src/axolotl/monkeypatch/moe/mlp.py
@@ -0,0 +1,86 @@
+"""
+Adapted from:
+https://github.com/shawntan/scattermoe
+https://arxiv.org/abs/2403.08245
+"""
+
+import gc
+import torch
+from torch import nn
+
+from axolotl.monkeypatch.moe import ops
+from axolotl.monkeypatch.moe.linear import ParallelExperts
+
+
+class FusedExperts(nn.Module):
+    """
+    Fused replacement for a list of Mixtral-style expert MLPs.
+
+    Each source expert is expected to expose `w1`, `w3` (hidden -> ffn) and
+    `w2` (ffn -> hidden) linear layers. `w1` and `w3` are concatenated into one
+    stacked input projection (rows [0, ffn_dim) hold `w1`, rows
+    [ffn_dim, 2 * ffn_dim) hold `w3`), and `w2` becomes the stacked output
+    projection.
+    """
+
+    def __init__(
+        self,
+        experts: nn.ModuleList =None,
+        hidden_dim=128,
+        ffn_dim=512,
+        num_experts=8,
+        top_k=2,
+        activation=nn.SiLU(),
+    ):
+        """
+        This implements fused experts that are compatible with Mixtral.
+        MLP of type Gated-Linear Unit, typically with a SiLU activation function.
+
+        Args:
+            experts: the existing expert modules whose weights are copied in.
+            hidden_dim: model hidden size (expert input/output width).
+            ffn_dim: intermediate width of each expert.
+            num_experts: number of experts to allocate.
+            top_k: experts used per token (clamped to `num_experts`).
+            activation: activation applied to the gate (`w1`) projection.
+
+        Raises:
+            ValueError: if `experts` is missing or empty.
+        """
+        super().__init__()
+
+        if experts is None or len(experts) == 0:
+            raise ValueError("FusedExperts requires a non-empty list of experts to fuse")
+
+        reference_weight = experts[0].w1.weight
+        device = reference_weight.device
+        dtype = reference_weight.dtype
+        self.num_experts = num_experts
+        self.hidden_dim = hidden_dim
+        self.ffn_dim = ffn_dim
+        self.experts = ParallelExperts(num_experts, hidden_dim, 2 * ffn_dim, device=device)
+        self.output_experts = ParallelExperts(num_experts, ffn_dim, hidden_dim, device=device)
+        if dtype.is_floating_point:
+            # Keep the fused weights in the same dtype as the experts they replace;
+            # otherwise they silently end up in torch's default dtype, which may
+            # differ from the dtype of the activations fed to the kernels.
+            self.experts = self.experts.to(dtype=dtype)
+            self.output_experts = self.output_experts.to(dtype=dtype)
+        self.top_k = min(top_k, self.num_experts)
+        self.activation = activation
+
+        with torch.no_grad():
+            for i, expert in enumerate(experts):
+                # Layout per expert: [w1 ; w3] stacked along the output axis.
+                self.experts.weight.data[i].copy_(
+                    torch.cat(
+                        [expert.w1.weight.detach(), expert.w3.weight.detach()],
+                        dim=0
+                    )
+                )
+                self.output_experts.weight.data[i].copy_(
+                    expert.w2.weight.detach()
+                )
+
+    def forward(
+        self, x: torch.Tensor, routing_weights: torch.Tensor, selected_experts: torch.Tensor
+    ):
+        """
+        Run the routed expert MLPs on `x`.
+
+        Args:
+            x: input whose last dimension is the hidden size; leading
+                dimensions are flattened and restored on return.
+            routing_weights: per-token weights of the selected experts.
+            selected_experts: per-token indices of the selected experts.
+
+        Returns:
+            Tensor with the leading shape of `x` and the output experts'
+            feature size as last dimension.
+        """
+        x_shape = x.size()
+        x = x.view(-1, x_shape[-1])
+        with torch.no_grad():
+            # Routing bookkeeping is pure indexing and needs no gradient.
+            sorted_expert_idxs, sorted_scattered_idxs = ops.flatten_and_sort(
+                selected_experts
+            )
+            padded_block_idxs, expert_offsets = ops.padded_block_indices(
+                sorted_expert_idxs, self.num_experts
+            )
+
+        # The fused projection yields [w1(x) | w3(x)] along the last axis, in the
+        # order the weights were concatenated in __init__.
+        gate_proj, up_proj = self.experts(
+            x,
+            self.top_k,
+            sorted_expert_idxs,
+            sorted_scattered_idxs,
+            padded_block_idxs,
+            expert_offsets,
+            grouped_out=True,
+        ).chunk(2, dim=-1)
+        # Mixtral's expert MLP is w2(act(w1(x)) * w3(x)): the activation belongs
+        # on the w1 half, not on the w3 half.
+        h = self.activation(gate_proj) * up_proj
+        y = self.output_experts(
+            h,
+            1,
+            sorted_expert_idxs,
+            sorted_scattered_idxs,
+            padded_block_idxs,
+            expert_offsets,
+            grouped_in=True,
+            gates=routing_weights,
+        )
+        y = y.view(*x_shape[:-1], y.size(-1))
+        return y
--- a/src/axolotl/monkeypatch/moe/moe.py
+++ b/src/axolotl/monkeypatch/moe/moe.py
@@ -0,0 +1,50 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from axolotl.monkeypatch.moe.mlp import FusedExperts
+
+class SparseMoeBlock(nn.Module):
+    """
+    Sparse mixture-of-experts block: a router (`gate`) picks `top_k` experts
+    per token and the experts are evaluated through `FusedExperts`.
+    """
+
+    def __init__(self, experts, gate, hidden_dim, ffn_dim, num_experts, top_k):
+        """
+        Args:
+            experts: existing expert modules; their weights are fused and the
+                first expert's `act_fn` is used as activation.
+            gate: router module mapping hidden states to per-expert logits.
+            hidden_dim: model hidden size.
+            ffn_dim: intermediate width of each expert.
+            num_experts: number of experts.
+            top_k: number of experts selected per token.
+        """
+        super().__init__()
+        self.hidden_dim = hidden_dim
+        self.ffn_dim = ffn_dim
+        self.num_experts = num_experts
+        self.top_k = top_k
+        self.gate = gate
+        self.experts = FusedExperts(
+            experts=experts,
+            hidden_dim=hidden_dim,
+            ffn_dim=ffn_dim,
+            num_experts=num_experts,
+            top_k=top_k,
+            activation=experts[0].act_fn
+        )
+
+    def _post_training(self, model, name):
+        """
+        Work in progress: recover the per-expert weights from the fused layout.
+
+        Currently only splits the fused tensors; rebuilding the original MoE
+        module is still a TODO, so nothing is returned or modified.
+        """
+        # get original weights back: reverse the concat + stack in the fused experts
+        # The fused input projection is (num_experts, 2 * ffn_dim, hidden_dim) with
+        # w1 in the first half of dim 1 and w3 in the second half, so halve dim 1
+        # first and only then unbind along the expert axis. (torch.split cannot be
+        # applied to the tuple returned by torch.unbind.)
+        fused_in = self.experts.experts.weight
+        w1s, w3s = (torch.unbind(half, dim=0) for half in fused_in.chunk(2, dim=1))
+        w2s = torch.unbind(self.experts.output_experts.weight, dim=0)
+
+        # TODO: recreate MoE class with original weights
+        experts = []
+        for i in range(self.num_experts):
+            pass
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Route each token to its top-k experts and combine their outputs.
+
+        Args:
+            hidden_states: tensor of shape (batch, sequence, hidden_dim).
+
+        Returns:
+            Tuple of (output with the same shape as `hidden_states`,
+            router logits of shape (batch * sequence, n_experts)).
+        """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+        # Softmax is computed in float32, then the top-k weights are renormalised to sum to 1.
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        # Fused expert forward
+        final_hidden_states = self.experts(hidden_states, routing_weights, selected_experts)
+
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
--- a/src/axolotl/monkeypatch/moe/ops.py
+++ b/src/axolotl/monkeypatch/moe/ops.py
@@ -0,0 +1,353 @@
+"""
+Adapted from:
+https://github.com/shawntan/scattermoe
+https://arxiv.org/abs/2403.08245
+"""
+
+import torch
+import triton
+import triton.language as tl
+from torch.nn import functional as F
+
+BLOCK_M = 128
+
+@torch.jit.script
+def flatten_and_sort(expert_idxs: torch.Tensor):
+    """
+    Flatten the expert-index tensor and sort it in ascending order.
+
+    Returns the sorted expert indices together with, for each sorted entry,
+    its position in the flattened input.
+    """
+    ordered_experts, source_positions = expert_idxs.flatten().sort()
+    return ordered_experts, source_positions
+
+@torch.jit.script
+def padded_block_indices(sorted_experts_idxs: torch.Tensor, k: int, N_BLOCK_SIZE: int=BLOCK_M) :
+    """
+    Compute, for every row-block the kernels will process, the offset in the
+    sorted index array at which that block starts.
+
+    Args:
+        sorted_experts_idxs: expert indices sorted in ascending order.
+        k: passed to `torch.bincount` as `minlength`; callers in this package
+            pass the number of experts.
+        N_BLOCK_SIZE: number of rows per block.
+
+    Returns:
+        Tuple of (start offset of each padded block, cumulative end offset of
+        each expert's rows in the sorted array).
+    """
+    expert_counts = torch.bincount(sorted_experts_idxs, minlength=k)
+    # Ceil-division of the counts by the block size, written as ((n - 1) // B) + 1.
+    # NOTE(review): for an expert with zero rows this gives 0 blocks only if `//`
+    # floors negative values — confirm for the PyTorch version in use.
+    padded_block_counts = ((expert_counts - 1) // N_BLOCK_SIZE) + 1
+    padded_expert_block_end = padded_block_counts.cumsum(-1)
+    expert_boundaries_end = expert_counts.cumsum(-1)
+    expert_boundaries_start = expert_boundaries_end - expert_counts
+    padded_expert_block_start = padded_expert_block_end - padded_block_counts
+    # One entry per block, across all experts.
+    block_idxs = torch.arange(padded_expert_block_end[-1],
+                              dtype=sorted_experts_idxs.dtype,
+                              device=sorted_experts_idxs.device)
+    # block_mask[b, e] is True when block b does NOT belong to expert e.
+    block_mask = (
+        (block_idxs[:, None] < padded_expert_block_start) |
+        (block_idxs[:, None] >= padded_expert_block_end)
+    )
+    # Candidate start offset of block b if it belonged to expert e:
+    # (block index within the expert) * block size + first row of that expert.
+    expanded_block_idxs = (
+        N_BLOCK_SIZE * (block_idxs[:, None] - padded_expert_block_start) +
+        expert_boundaries_start
+    )
+    # Zero the non-owning experts and sum, leaving the owning expert's value per block.
+    expanded_block_idxs = expanded_block_idxs.masked_fill(block_mask, 0).sum(-1)
+    return expanded_block_idxs, expert_boundaries_end
+
+
+
+def _scatter2scatter_configs():
+    # Autotune search space for the `_scatter2scatter` kernel (a single entry).
+    tile_shape = {'BLOCK_N': 128, 'BLOCK_K': 32}
+    only_config = triton.Config(tile_shape, num_stages=4, num_warps=4)
+    return [only_config]
+
+# Triton kernel: for one block of BLOCK_M rows (all belonging to a single expert)
+# and one block of BLOCK_N output columns, compute rows-of-X @ W[expert] and store
+# the result into Y. Whether rows are addressed by their sorted position or by their
+# scattered index is selected independently for input (x_grouped) and output (y_grouped).
+@triton.autotune(configs=_scatter2scatter_configs(), key=['M', 'N', 'K'], )
+@triton.heuristics({
+    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
+    "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
+})
+@triton.jit
+def _scatter2scatter(
+    X_ptr, stride_xm, stride_xk,
+    W_ptr, stride_we, stride_wk, stride_wn,
+    Y_ptr, stride_ym, stride_yn,
+    grouped_idx_ptr, expert_idxs_ptr, block_start_idx_ptr,
+    FAN_OUT: tl.constexpr,
+    M: tl.constexpr, K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
+    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+    ACC_TYPE: tl.constexpr,
+    OUT_M: tl.constexpr,
+    allow_tf32: tl.constexpr,
+    x_grouped: tl.constexpr, y_grouped: tl.constexpr,
+    NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
+):
+    # NOTE: OUT_M is only referenced by the commented-out line below.
+    pid = tl.program_id(axis=0)
+
+    # The 1-D program id enumerates (row block, column block) pairs, row block major.
+    N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
+    M_block_id = pid // N_BLOCK_COUNT
+    N_block_id = pid % N_BLOCK_COUNT
+    M_range = tl.arange(0, BLOCK_M)
+    # First row (in sorted order) handled by this row block.
+    block_start_idx = tl.load(block_start_idx_ptr + M_block_id)
+    # M_block = tl.max_contiguous((block_start_idx + M_range) % OUT_M, BLOCK_M)
+    M_block = tl.max_contiguous(block_start_idx + M_range, BLOCK_M)
+    # Out-of-range rows read the sentinel E, so the minimum is the expert of the
+    # block's first row; rows of any other expert are masked out via E_mask.
+    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_block < (FAN_OUT * M), other=E)
+    E_idx = tl.min(E_idxs)
+    E_mask = E_idxs == E_idx
+    M_idx = tl.load(grouped_idx_ptr + M_block, mask=E_mask, other=0)
+    if x_grouped:
+        # Input rows are addressed by their sorted position.
+        M_in_idx = M_block
+    else:
+        # Input rows are addressed by scattered index; FAN_OUT consecutive indices share a row.
+        M_in_idx = M_idx // FAN_OUT
+
+    if y_grouped:
+        # Output rows are written at their sorted position.
+        M_out_idx = M_block
+    else:
+        # Output rows are written at their scattered index.
+        M_out_idx = M_idx
+
+    K_block = tl.arange(0, BLOCK_K)
+
+    N_block = N_block_id * BLOCK_N  + tl.arange(0, BLOCK_N)
+    N_mask = N_block < N
+    # N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
+    # N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
+
+    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
+    W_blk_ptrs = W_ptr + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn + E_idx * stride_we
+
+    # Accumulate the (BLOCK_M, BLOCK_N) tile over the K dimension in BLOCK_K steps.
+    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
+    iters = tl.cdiv(K, BLOCK_K)
+    for K_block_id in range(0, iters):
+        if NO_K_MASK:
+            # K is a multiple of BLOCK_K: no bounds mask needed along K.
+            x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
+            if NO_N_MASK:
+                w = tl.load(W_blk_ptrs)
+            else:
+                w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
+        else:
+            K_mask = (K_block_id * BLOCK_K + K_block) < K
+            x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
+            w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])
+        X_blk_ptrs += BLOCK_K * stride_xk
+        W_blk_ptrs += BLOCK_K * stride_wk
+        acc += tl.dot(x, w, allow_tf32=allow_tf32, out_dtype=ACC_TYPE)
+
+    # Only rows of this block's expert and in-range columns are written.
+    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
+    tl.store(Y_blk_ptrs, acc, mask=E_mask[:, None] & N_mask[None, :])
+
+def scatter2scatter(X, W, sorted_expert_idxs, sorted_scattered_idxs, k,
+                    padded_block_idxs, x_grouped=False, y_grouped=False,
+                    out=None):
+    """
+    Launch the `_scatter2scatter` kernel: multiply rows of `X` by the weight
+    matrix of the expert each row is routed to.
+
+    Args:
+        X: 2-D input; `X.size(0) * k` must equal the number of sorted indices.
+        W: stacked expert weights, indexed as (expert, in_features, out_features).
+        sorted_expert_idxs: expert index of each row, in sorted order.
+        sorted_scattered_idxs: scattered index of each sorted row.
+        k: fan-out passed to the kernel as FAN_OUT.
+        padded_block_idxs: start offset of every row block to process.
+        x_grouped: input rows are already in sorted order.
+        y_grouped: write output rows in sorted order.
+        out: optional preallocated output of shape (number of sorted indices, W.size(-1)).
+
+    Returns:
+        The output tensor (`out` if given, otherwise newly allocated with
+        `torch.empty`, so rows the kernel does not write stay uninitialised).
+    """
+    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
+    assert sorted_scattered_idxs.size(0) == X.size(0) * k
+    # Pre-kernel setup
+    x_dim = X.size(-1)
+    y_dim = W.size(-1)
+    L_scattered = sorted_expert_idxs.size(0)
+    if out is None:
+        O = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
+    else:
+        assert out.size(0) == L_scattered and out.size(1) == y_dim
+        O = out
+
+    # One program per (row block, column block) pair; BLOCK_N comes from autotuning.
+    def grid(META):
+        grid_num = (
+            padded_block_idxs.size(0) *
+            triton.cdiv(META['N'], META['BLOCK_N']),
+        )
+        return grid_num
+    # The bare string below is a disabled debug print, kept as a no-op expression.
+    """
+    print("X", X.size(), X.stride(),
+          "W", W.size(), W.stride(),
+          "O", O.size(), O.stride(),
+          "sorted_idxs", sorted_scattered_idxs.size(),
+          "FAN_OUT", k,
+          "BLOCK_M", BLOCK_M,
+          "grouped", (x_grouped, y_grouped))
+    """
+    _scatter2scatter[grid](
+        # X_ptr, stride_xm, stride_xk,
+        X, X.stride(0), X.stride(1),
+        # W_ptr, stride_we, stride_wk, stride_wn,
+        W, W.stride(0), W.stride(1), W.stride(2),
+        # Y_ptr, stride_ym, stride_yn,
+        O, O.stride(0), O.stride(1),
+        grouped_idx_ptr=sorted_scattered_idxs,
+        expert_idxs_ptr=sorted_expert_idxs,
+        block_start_idx_ptr=padded_block_idxs,
+        FAN_OUT=k,
+        M=X.size(0),
+        K=X.size(1),
+        N=O.size(1), E=W.size(0),
+        BLOCK_M=BLOCK_M,
+        ACC_TYPE=tl.float32,
+        OUT_M=O.size(0),
+        allow_tf32=True,
+        x_grouped=x_grouped, y_grouped=y_grouped,
+    )
+    return O
+
+
+def _config_XtY():
+    # Autotune search space for the `_groupXtY` kernel (a single entry).
+    tile_shape = {'BLOCK_N': 128, 'BLOCK_K': 128, 'BLOCK_M': 32}
+    only_config = triton.Config(tile_shape, num_stages=4, num_warps=4)
+    return [only_config]
+
+def group_bwd_W(DY, X, expert_offsets, E):
+    """
+    Compute the per-expert weight gradient X^T @ DY over each expert's rows.
+
+    Args:
+        DY: 2-D gradient rows, grouped by expert.
+        X: 2-D input rows, grouped by expert in the same order as `DY`.
+        expert_offsets: cumulative end offset of each expert's rows.
+        E: number of experts.
+
+    Returns:
+        A view of shape (E, X.size(-1), DY.size(-1)) onto zero-initialised
+        storage laid out as (E, DY.size(-1), X.size(-1)).
+    """
+    # Zero-initialised because the kernel skips experts that received no rows.
+    DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
+    DW = DWt.permute(0, 2, 1)
+    # Axis 0 enumerates (expert, K block) pairs, axis 1 enumerates N blocks.
+    def grid(META):
+        grid = (
+            E * triton.cdiv(META['K'], META['BLOCK_K']),
+            triton.cdiv(META['N'], META['BLOCK_N']),
+        )
+        return grid
+    _groupXtY[grid](
+        # DY_ptr, stride_dym, stride_dyk,
+        DY, DY.stride(0), DY.stride(1),
+        # X_ptr, stride_xm, stride_xn,
+        X, X.stride(0), X.stride(1),
+        # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
+        DW, DW.stride(0), DW.stride(1), DW.stride(2),
+        # expert_offsets_ptr,
+        expert_offsets,
+        # K: tl.constexpr, N: tl.constexpr,
+        M=DY.size(0), N=DY.size(-1), K=X.size(-1),
+        # ACC_TYPE: tl.constexpr,
+        ACC_TYPE=tl.float32,
+        allow_tf32=True
+    )
+    return DW
+
+# Triton kernel: each program computes one (BLOCK_K, BLOCK_N) tile of X^T @ DY for a
+# single expert, reducing over that expert's rows [start_idx, end_idx) in BLOCK_M steps.
+@triton.autotune(configs=_config_XtY(), key=['M', 'N', 'K'], )
+@triton.heuristics({
+    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0,
+    "NO_N_MASK": lambda args: (args['N'] % args['BLOCK_N']) == 0,
+})
+@triton.jit
+def _groupXtY(
+    DY_ptr, stride_dym, stride_dyk,
+    X_ptr, stride_xm, stride_xn,
+    DW_ptr, stride_dwe, stride_dwk, stride_dwn,
+    expert_offsets_ptr,
+    M: tl.constexpr, K: tl.constexpr, N: tl.constexpr,
+    BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+    ACC_TYPE: tl.constexpr,
+    allow_tf32: tl.constexpr,
+    NO_K_MASK: tl.constexpr, NO_N_MASK: tl.constexpr
+):
+    pid0 = tl.program_id(axis=0)
+    pid1 = tl.program_id(axis=1)
+    num0 = tl.num_programs(0)
+    num1 = tl.num_programs(1)
+    # Remap program ids with tl.swizzle2d (group size 128).
+    pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
+
+    # pid0 encodes (expert, K block); pid1 is the N block.
+    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
+    E_idx = pid0 // K_BLOCK_COUNT
+    K_block_id = pid0 % K_BLOCK_COUNT
+    N_block_id = pid1
+
+    # expert_offsets holds cumulative end offsets, so expert e covers
+    # rows [offsets[e - 1], offsets[e]) with an implicit 0 before the first expert.
+    if E_idx == 0:
+        start_idx = 0
+    else:
+        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
+    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
+
+    # Experts without rows are skipped entirely; nothing is stored for them.
+    if end_idx > start_idx:
+        M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)
+
+        # Masks are computed before the modulo wrap that keeps the indices in range.
+        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
+        K_mask = K_block < K
+        K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)
+
+        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
+        N_mask = N_block < N
+        N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)
+
+        M_idxs = M_block
+        # X is addressed transposed: tile axes are (K, M).
+        xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
+        dy_blk_ptrs = DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
+
+        acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
+        iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
+        for i in range(0, iters):
+            # Rows at or beyond this expert's end offset are masked out.
+            M_mask = (i * BLOCK_M + M_block) < end_idx
+            if NO_K_MASK:
+                xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
+            else:
+                xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
+            if NO_N_MASK:
+                dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
+            else:
+                dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])
+            acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)
+            xt_blk_ptrs += BLOCK_M * stride_xm
+            dy_blk_ptrs += BLOCK_M * stride_dym
+
+
+        # Cast the accumulator to the output element type before the masked store.
+        DW_blk_ptrs = DW_ptr + E_idx * stride_dwe + K_block[:, None] * stride_dwk + N_block[None, :] * stride_dwn
+        acc = acc.to(DW_blk_ptrs.dtype.element_ty)
+        tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
+
+
+def _config_grouping():
+    # Autotune search space for the `_group` kernel: three tile shapes, largest first.
+    tile_shapes = ((256, 128), (128, 64), (64, 32))
+    return [
+        triton.Config({'BLOCK_N': block_n, 'BLOCK_K': block_k}, num_stages=4, num_warps=4)
+        for block_n, block_k in tile_shapes
+    ]
+
+def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
+    """
+    Gather rows of `A` into a new order, optionally scaling each gathered row.
+
+    Output row n is `A[idx[n] // fan_out]`, multiplied by `coeff[idx[n]]` when
+    `coeff` is given, where `idx` is the second argument.
+
+    Args:
+        A: 2-D source tensor; `A.size(0) * fan_out` must equal `idx.size(0)`.
+        sorted_expert_idxs: index tensor handed to the kernel as
+            `grouped_idx_ptr`. NOTE(review): despite the name, the callers in
+            `ParallelLinear.backward` pass `sorted_scattered_idxs` here.
+        coeff: optional flat per-index scale factors.
+        fan_out: number of consecutive indices that map to the same row of `A`.
+        out: optional preallocated output buffer to write into.
+
+    Returns:
+        The gathered tensor (`out` if given, otherwise newly allocated).
+    """
+    N = sorted_expert_idxs.size(0)
+    K = A.size(1)
+    assert A.size(0) * fan_out == N
+    if out is not None:
+        Y = out
+    else:
+        Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
+        # print("grp init:", Y.size())
+    # One program per block of BLOCK_N output rows.
+    def grid(META):
+        grid_num = (triton.cdiv(META['N'], META['BLOCK_N']),)
+        return grid_num
+    _group[grid](
+        # A_ptr, stride_an, stride_ai,
+        A, A.stride(0), A.stride(1), coeff is not None, coeff, fan_out,
+        # Y_ptr, stride_yn, stride_yk,
+        Y, Y.stride(0), Y.stride(1),
+        # grouped_idx_ptr,
+        sorted_expert_idxs,
+        # N: tl.constexpr, K: tl.constexpr,
+        N, K
+    )
+    return Y
+
+# Triton kernel: row gather. For each of BLOCK_N target rows, copy the source row
+# selected through grouped_idx_ptr (index // FAN_OUT) into the target, optionally
+# multiplied by a per-index coefficient.
+@triton.autotune(configs=_config_grouping(), key=['K'])
+@triton.heuristics({
+    "NO_K_MASK": lambda args: (args['K'] % args['BLOCK_K']) == 0
+})
+@triton.jit
+def _group(
+    src_ptr, stride_sn, stride_sk, has_coeff: tl.constexpr, coeff_ptr, FAN_OUT: tl.constexpr,
+    tgt_ptr, stride_tn, stride_ti,
+    grouped_idx_ptr,
+    N: tl.constexpr, K: tl.constexpr,
+    BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+    NO_K_MASK: tl.constexpr
+):
+    pid = tl.program_id(axis=0)
+
+    N_block_id = pid
+    N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
+    # The mask is taken before the modulo wrap; wrapped rows stay addressable but are never stored.
+    N_mask = N_blk < N
+    N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
+    N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)
+
+    K_blk = tl.arange(0, BLOCK_K)
+    src_blk_ptrs = src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
+    tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti
+
+    if has_coeff:
+        # One coefficient per gathered row, broadcast across the feature axis.
+        c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]
+
+    # Walk the feature dimension in BLOCK_K-wide chunks.
+    iters = tl.cdiv(K, BLOCK_K)
+    for i in range(0, iters):
+        if NO_K_MASK:
+            # Unmasked load: masked-out rows use index 0 (the `other` value above),
+            # so the address is still in range; only the store is masked.
+            block = tl.load(src_blk_ptrs) # , mask=N_mask[:, None])
+            if has_coeff:
+                block *= c
+            tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])
+
+        else:
+            K_mask = (i * BLOCK_K + K_blk) < K
+            mask = N_mask[:, None] & K_mask[None, :]
+            block = tl.load(src_blk_ptrs, mask=mask)
+            if has_coeff:
+                block *= c
+            tl.store(tgt_blk_ptrs, block, mask=mask)
+
+        src_blk_ptrs += BLOCK_K * stride_sk
+        tgt_blk_ptrs += BLOCK_K * stride_ti
--- a/src/axolotl/monkeypatch/moe/single.py
+++ b/src/axolotl/monkeypatch/moe/single.py
@@ -0,0 +1,66 @@
+"""
+Adapted from:
+https://github.com/shawntan/scattermoe
+https://arxiv.org/abs/2403.08245
+"""
+
+import torch
+import triton
+import triton.language as tl
+from torch.nn import functional as F
+
+# Triton kernel: each program computes BLOCK_N output features of one output row,
+# Y[pid1, n] = sum_k X[in_idx, k] * W[expert_idxs[pid1], k, n].
+# NOTE: none of the loads or the store below is masked, so the kernel relies on the
+# caller providing sizes that fit the tile shape exactly.
+@triton.jit
+def _single2scatter(
+    X_ptr, stride_xm, stride_xk,
+    W_ptr, stride_we, stride_wk, stride_wn,
+    Y_ptr, stride_ym, stride_yn,
+    expert_idxs_ptr,
+    FAN_OUT: tl.constexpr,
+    K: tl.constexpr, N: tl.constexpr, E: tl.constexpr,
+    BLOCK_N: tl.constexpr, BLOCK_K: tl.constexpr,
+    ACC_TYPE: tl.constexpr,
+):
+    pid0 = tl.program_id(axis=0)
+    pid1 = tl.program_id(axis=1)
+
+    N_block_id = pid0
+    if FAN_OUT == 1:
+        # One input row per output row.
+        in_idx = pid1
+    else:
+        # A single input row is shared by every output row.
+        in_idx = 0
+    out_idx = pid1
+
+    K_block = tl.arange(0, BLOCK_K)
+    N_block = tl.max_contiguous(tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N), BLOCK_N)
+    # Expert used for this output row.
+    E_idx = tl.load(expert_idxs_ptr + pid1)
+    # X is loaded as a (BLOCK_K, 1) column so it broadcasts against the (BLOCK_K, BLOCK_N) weight tile.
+    X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
+    W_blk_ptrs = W_ptr + E_idx * stride_we + K_block[:, None] * stride_wk + N_block[None, :] * stride_wn
+    acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
+    for K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
+        x = tl.load(X_blk_ptrs)
+        w = tl.load(W_blk_ptrs)
+        # Vector-matrix product done as an elementwise multiply and a reduction over K.
+        acc += tl.sum(x * w, axis=0)[None, :]
+        X_blk_ptrs += BLOCK_K * stride_xk
+        W_blk_ptrs += BLOCK_K * stride_wk
+    Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
+    tl.store(Y_blk_ptrs, acc)
+
+def single2scatter(X, W, expert_idxs):
+    """
+    Multiply one input row (or one row per selected expert) by the weight
+    matrices of the selected experts.
+
+    Args:
+        X: 2-D input with either 1 row or `expert_idxs.size(1)` rows.
+        W: stacked expert weights of shape (E, xdim, ydim).
+        expert_idxs: 2-D tensor whose second dimension lists the experts to apply.
+
+    Returns:
+        Tensor of shape (expert_idxs.size(1), ydim), one output row per selected expert.
+
+    Raises:
+        ValueError: if `xdim` or `ydim` is not a multiple of the kernel tile size.
+    """
+    E, xdim, ydim = W.size()
+    k = expert_idxs.size(1)
+    assert X.size(0) == k or X.size(0) == 1
+    BLOCK_N = 128
+    BLOCK_K = 128
+    # `_single2scatter` loads and stores whole tiles without bounds masks and the
+    # grid below uses floor division, so other sizes would read out of bounds
+    # along K or leave trailing output columns uninitialised. Fail loudly instead.
+    if xdim % BLOCK_K != 0 or ydim % BLOCK_N != 0:
+        raise ValueError(
+            f"single2scatter requires W.size(1) to be a multiple of {BLOCK_K} and "
+            f"W.size(2) to be a multiple of {BLOCK_N}, got {xdim} and {ydim}"
+        )
+    Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
+    # One program per (output column block, output row).
+    grid = ydim // BLOCK_N, k
+    _single2scatter[grid](
+        X, X.stride(0), X.stride(1),
+        W, W.stride(0), W.stride(1), W.stride(2),
+        Y, Y.stride(0), Y.stride(1),
+        expert_idxs,
+        FAN_OUT=Y.size(0) // X.size(0),
+        K=xdim, N=ydim, E=E,
+        BLOCK_N=BLOCK_N, BLOCK_K=BLOCK_K,
+        ACC_TYPE=tl.float32
+    )
+    return Y
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -12,7 +12,6 @@ from axolotl.monkeypatch.utils import get_unpad_data
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
-    "qwen2_moe",
    "falcon",
    "phi",
    "gemma",
@@ -32,10 +31,6 @@ def patch_for_multipack(model_type, model_name=None):
        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
-    elif model_type == "qwen2_moe":
-        transformers.models.qwen2_moe.modeling_qwen2_moe._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
    elif model_type == "falcon":
        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
@@ -53,16 +48,14 @@ def patch_for_multipack(model_type, model_name=None):
            get_unpad_data
        )
    elif model_type == "gemmoe":
-        patch_remote(model_name, ".configuration_gemmoe", ".modeling_gemmoe")
-    elif model_type == "jamba":
-        patch_remote(model_name, ".configuration_jamba", ".modeling_jamba")
-
-
-def patch_remote(model_name, config_name, modeling_name):
-    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-    # we need to load the model here in order for modeling_* to be available
-    with init_empty_weights():
-        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-    module_name = model_config.__class__.__module__.replace(config_name, modeling_name)
-    modeling_arch = importlib.import_module(module_name)
-    modeling_arch._get_unpad_data = get_unpad_data  # pylint: disable=protected-access
+        model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
+        # we need to load the model here in order for modeling_gemmoe to be available
+        with init_empty_weights():
+            AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+        module_name = model_config.__class__.__module__.replace(
+            ".configuration_gemmoe", ".modeling_gemmoe"
+        )
+        modeling_gemmoe = importlib.import_module(module_name)
+        modeling_gemmoe._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
--- a/src/axolotl/prompt_strategies/base.py
+++ b/src/axolotl/prompt_strategies/base.py
@@ -1,20 +0,0 @@
-"""
-module for base dataset transform strategies
-"""
-
-import importlib
-import logging
-
-LOG = logging.getLogger("axolotl")
-
-
-def load(strategy, cfg, module_base=None, **kwargs):
-    try:
-        load_fn = strategy.split(".")[-1]
-        strategy = ".".join(strategy.split(".")[:-1])
-        mod = importlib.import_module(f".{strategy}", module_base)
-        func = getattr(mod, load_fn)
-        return func(cfg, **kwargs)
-    except Exception:  # pylint: disable=broad-exception-caught
-        LOG.warning(f"unable to load strategy {strategy}")
-        return None
--- a/src/axolotl/prompt_strategies/dpo/init.py
+++ b/src/axolotl/prompt_strategies/dpo/init.py
@@ -1,8 +1,20 @@
 """
 module for DPO style dataset transform strategies
 """
-from functools import partial

-from ..base import load as load_base
+import importlib
+import logging

-load = partial(load_base, module_base="axolotl.prompt_strategies.dpo")
+LOG = logging.getLogger("axolotl")
+
+
+def load(strategy, cfg, **kwargs):
+    try:
+        load_fn = strategy.split(".")[-1]
+        strategy = ".".join(strategy.split(".")[:-1])
+        mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies.dpo")
+        func = getattr(mod, load_fn)
+        return func(cfg, **kwargs)
+    except Exception:  # pylint: disable=broad-exception-caught
+        LOG.warning(f"unable to load strategy {strategy}")
+        return None
--- a/src/axolotl/prompt_strategies/orpo/init.py
+++ b/src/axolotl/prompt_strategies/orpo/init.py
@@ -1,9 +0,0 @@
-"""
-module for ORPO style dataset transform strategies
-"""
-
-from functools import partial
-
-from ..base import load as load_base
-
-load = partial(load_base, module="axolotl.prompt_strategies.orpo")
--- a/src/axolotl/prompt_strategies/orpo/chat_template.py
+++ b/src/axolotl/prompt_strategies/orpo/chat_template.py
@@ -1,188 +0,0 @@
-"""chatml prompt tokenization strategy for ORPO"""
-from typing import Any, Dict, Generator, List, Optional, Tuple
-
-from pydantic import BaseModel
-
-from axolotl.prompt_tokenizers import IGNORE_INDEX, PromptTokenizingStrategy
-from axolotl.prompters import Prompter
-from axolotl.utils.chat_templates import chat_templates
-
-
-class Message(BaseModel):
-    """message/turn"""
-
-    role: str
-    content: str
-    label: Optional[bool] = None
-
-
-class MessageList(BaseModel):
-    """conversation"""
-
-    messages: List[Message]
-
-
-def load(
-    tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
-    """
-    chatml transforms for datasets with system, input, chosen, rejected
-    """
-
-    chat_template = chat_templates("chatml")
-    if ds_cfg and "chat_template" in ds_cfg:
-        chat_template = ds_cfg["chat_template"]
-        try:
-            chat_template = chat_templates(chat_template)
-        except ValueError:
-            pass
-    tokenizer.chat_template = chat_template
-
-    return ORPOTokenizingStrategy(
-        ORPOPrompter(chat_template, tokenizer),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-        dataset_parser=ORPODatasetParsingStrategy(),
-    )
-
-
-class ORPODatasetParsingStrategy:
-    """Strategy to parse chosen rejected dataset into messagelist"""
-
-    def get_chosen_conversation_thread(self, prompt) -> MessageList:
-        """Dataset structure mappings"""
-
-        messages: List[Message] = []
-        if system := prompt.get("system", None):
-            messages.append(Message(role="system", content=system, label=False))
-        messages.append(Message(role="user", content=prompt["prompt"], label=False))
-        messages.append(
-            Message(
-                role="assistant", content=prompt["chosen"][1]["content"], label=True
-            )
-        )
-        return MessageList(messages=messages)
-
-    def get_rejected_conversation_thread(self, prompt) -> MessageList:
-        """Dataset structure mappings"""
-
-        messages: List[Message] = []
-        if system := prompt.get("system", None):
-            messages.append(Message(role="system", content=system, label=False))
-        messages.append(Message(role="user", content=prompt["prompt"], label=False))
-        messages.append(
-            Message(
-                role="assistant", content=prompt["rejected"][1]["content"], label=True
-            )
-        )
-        return MessageList(messages=messages)
-
-
-class ORPOTokenizingStrategy(PromptTokenizingStrategy):
-    """
-    rejected_input_ids
-    input_ids
-    rejected_attention_mask
-    attention_mask
-    rejected_labels
-    labels
-    """
-
-    def __init__(
-        self,
-        *args,
-        dataset_parser=None,
-        **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        self.dataset_parser = dataset_parser
-
-    def tokenize_prompt(self, prompt):
-        # pass the rejected prompt/row to the Prompter to get the formatted prompt
-        prompt_len = 0
-        rejected_message_list = self.dataset_parser.get_rejected_conversation_thread(
-            prompt
-        )
-        input_ids = []
-        labels = []
-        for _, (part, label) in enumerate(
-            self.prompter.build_prompt(rejected_message_list)
-        ):
-            if not part:
-                continue
-            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
-            prev_idx = len(input_ids)
-            input_ids += _input_ids[prev_idx:]
-            if label:
-                labels += input_ids[prev_idx:]
-            else:
-                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)
-                prompt_len = len(input_ids)
-        # remap the input_ids, attention_mask and labels
-        rejected_input_ids = input_ids
-        rejected_labels = labels
-        # pass the chosen prompt/row to the Prompter to get the formatted prompt
-        chosen_message_list = self.dataset_parser.get_chosen_conversation_thread(prompt)
-        input_ids = []
-        labels = []
-        for _, (part, label) in enumerate(
-            self.prompter.build_prompt(chosen_message_list)
-        ):
-            if not part:
-                continue
-            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
-            prev_idx = len(input_ids)
-            input_ids += _input_ids[prev_idx:]
-            if label:
-                labels += input_ids[prev_idx:]
-            else:
-                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)
-
-        return {
-            "rejected_input_ids": rejected_input_ids,
-            "rejected_labels": rejected_labels,
-            "rejected_attention_mask": [1] * len(rejected_labels),
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": [1] * len(labels),
-            "prompt_attention_mask": [1] * prompt_len
-            + [0] * (len(labels) - prompt_len),
-        }
-
-
-class ORPOPrompter(Prompter):
-    """Single Turn prompter for ORPO"""
-
-    def __init__(self, chat_template, tokenizer):
-        self.chat_template = chat_template
-        self.tokenizer = tokenizer
-
-    def build_prompt(
-        self,
-        message_list: MessageList,
-    ) -> Generator[Tuple[str, bool], None, None]:
-        conversation = []
-        for message in message_list.messages:
-            conversation.append(message.model_dump())
-            if message.role == "system":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
-            if message.role == "user":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=True,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
-            if message.role == "assistant":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), True
--- a/src/axolotl/prompt_strategies/pretrain.py
+++ b/src/axolotl/prompt_strategies/pretrain.py
@@ -20,11 +20,10 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
    def supports_batched(self):
        return True

-    def __init__(self, *args, max_length=None, text_column="text", **kwargs):
+    def __init__(self, *args, max_length=None, **kwargs):
        super().__init__(*args, **kwargs)
        if max_length:
            self.max_length = max_length
-        self.text_column = text_column

    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
@@ -45,7 +44,7 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
        return res

    def tokenize_prompt(self, prompt):
-        return self._tokenize(prompt[self.text_column])
+        return self._tokenize(prompt["text"])


 def load(tokenizer, cfg):
@@ -54,7 +53,6 @@ def load(tokenizer, cfg):
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
-        text_column=cfg.pretraining_dataset[0]["text_column"] or "text",
        max_length=cfg.sequence_len * 64,
    )
    return strat
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -1,6 +1,5 @@
 """Module containing the SimpleShareGPTPromptTokenizingStrategy class"""

-import logging
 from typing import Any, Dict, Optional

 from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template
@@ -12,8 +11,6 @@ from axolotl.utils.tokenization import (
    merge_consecutive_messages,
 )

-LOG = logging.getLogger("axolotl")
-

 def register_chatml_template(system_message=None):
    system_message = system_message or "You are a helpful assistant."
@@ -45,13 +42,11 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    )
    field_human = ds_cfg["field_human"] if ds_cfg and "field_human" in ds_cfg else None
    field_model = ds_cfg["field_model"] if ds_cfg and "field_model" in ds_cfg else None
-    roles = ds_cfg["roles"].to_dict() if ds_cfg and "roles" in ds_cfg else None
    strategy = SimpleShareGPTPromptTokenizingStrategy(
        ShareGPTPrompterV2(
            conversation=conversation,
            role_key_model=field_model,
            role_key_human=field_human,
-            roles=roles,
        ),
        tokenizer,
        cfg.train_on_inputs,
@@ -147,12 +142,7 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
            "system": "system",
        }
        turns = [
-            {
-                "from": (
-                    role_map[t[role_key]] if t[role_key] in role_map else t[role_key]
-                ),
-                "value": t[value_key],
-            }
+            {"from": role_map[t[role_key]], "value": t[value_key]}
            for t in conversations
        ]
        return turns
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -11,7 +11,7 @@ from transformers import BatchEncoding, PreTrainedTokenizer
 from axolotl.monkeypatch.fastchat_conversation_turns import (
    add_get_turns_to_conversation,
 )
-from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
+from axolotl.prompters import IGNORE_TOKEN_ID

 LOG = logging.getLogger("axolotl")

@@ -37,7 +37,7 @@ class PromptTokenizingStrategy(abc.ABC):

    def __init__(
        self,
-        prompter: Prompter,
+        prompter,
        tokenizer,
        train_on_inputs: bool = False,
        sequence_len: int = 2048,
@@ -340,23 +340,6 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
            self.prompter._conversation.copy()  # pylint: disable=protected-access
        )

-        input_roles = {conversation.roles[0]}
-        output_roles = {conversation.roles[1]}
-
-        if len(conversation.roles) == 3:
-            tool_role_label = conversation.roles[2]
-            input_roles.add(tool_role_label)
-
-        # Add roles from the config
-        if self.prompter.roles:
-            if "input" in self.prompter.roles and self.prompter.roles["input"]:
-                for role in self.prompter.roles["input"]:
-                    input_roles.add(role)
-
-            if "output" in self.prompter.roles and self.prompter.roles["output"]:
-                for role in self.prompter.roles["output"]:
-                    output_roles.add(role)
-
        # support for custom roles from the dataset, only useful for vicuna style prompts/roles
        role_remap = []
        if (
@@ -377,18 +360,19 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    LOG.warning(f"expected tuple, got {part}")
                    continue

+                tool_role_label = None
+                if len(conversation.roles) == 3:
+                    (
+                        user_role_label,
+                        assistant_role_label,
+                        tool_role_label,
+                    ) = conversation.roles
+                else:
+                    user_role_label, assistant_role_label = conversation.roles
                role, content = part

                # Uses "in" because role contains extra characters
-                input_turn = any(r.lower() in role.lower() for r in input_roles)
-                output_turn = any(r.lower() in role.lower() for r in output_roles)
-                empty_role = role.strip() == ""
-
-                if not any([input_turn, output_turn, empty_role]):
-                    LOG.warning(f"unhandled role: {role}")
-                    continue
-
-                if input_turn:
+                if user_role_label in role:
                    role = (
                        role.replace(role_remap[0]["from"], role_remap[0]["to"])
                        if role_remap
@@ -408,7 +392,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    else:
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
-                elif output_turn:
+                elif assistant_role_label in role:
                    role = (
                        role.replace(role_remap[1]["from"], role_remap[1]["to"])
                        if role_remap
@@ -439,7 +423,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                        labels[:len_role] = [IGNORE_TOKEN_ID] * min(
                            len_role, len(labels)
                        )
-                elif empty_role:
+                elif role == "":
                    turn = content
                    # this is only ever the first part, should include the bos token and the user query
                    res = self._tokenize(
@@ -450,6 +434,11 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    else:
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                elif tool_role_label and tool_role_label in role:
+                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                else:
+                    LOG.warning(f"unhandled role: {role}")
+                    continue

                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -259,12 +259,6 @@ SHAREGPT_ASSERTION_FAILED_ROLE = (
    "Role did not alternate between turns (gpt and human). Please check your data."
 )

-CONVERSATION_ROLE_FORMAT = {
-    "chatml": "<|im_start|>{ROLE}",
-    "zephyr": "<|{ROLE}|>",
-    "vicuna_v1.1": "{ROLE}",
-}
-

 class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
    """
@@ -274,9 +268,7 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
    role_key_human = "human"
    role_key_model = "gpt"
    # Optional, only used for tool usage datasets.
-    role_key_tool: Optional[str] = None
-    # Optional, role input/output mapping
-    roles: Optional[dict] = None
+    role_key_tool = None

    def __init__(
        self,
@@ -285,7 +277,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
        role_key_human: Optional[str] = None,
        role_key_model: Optional[str] = None,
        role_key_tool: Optional[str] = None,
-        roles: Optional[dict] = None,
    ):
        if conversation:
            if isinstance(conversation, Conversation):
@@ -300,8 +291,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
            self.role_key_model = role_key_model
        if role_key_tool:
            self.role_key_tool = role_key_tool
-        if roles:
-            self.roles = roles

    def _build_result(self, source):
        if len(source) < 2:
@@ -333,23 +322,11 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods

        conv.messages = []
        for _, sentence in enumerate(source):
-            from_role = sentence["from"]
-            if from_role in roles:
-                role = roles[from_role]
-            else:
-                if self._conversation.name not in CONVERSATION_ROLE_FORMAT:
-                    raise NotImplementedError(
-                        f"Role ({role}) not in default roles, and {self._conversation.name} does not support role remapping yet."
-                        "Please help us by creating an Issue to add support for this conversation type."
-                    )
-
-                role = CONVERSATION_ROLE_FORMAT[self._conversation.name].format(
-                    ROLE=from_role
-                )
-
-            if len(conv.messages) > 0 and ((role == conv.messages[-1][0])):
+            role = roles[sentence["from"]]
+            if len(conv.messages) > 0 and (
+                (role == conv.messages[-1][0]) or (role not in conv.roles)
+            ):
                LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}")
-
            conv.append_message(role, sentence["value"])

        return conv.get_turns()
@@ -377,13 +354,11 @@ class ShareGPTPrompterV2(ShareGPTPrompter):
        conversation: Optional[Union[str, Conversation]] = None,
        role_key_human: Optional[str] = None,
        role_key_model: Optional[str] = None,
-        roles: Optional[dict] = None,
    ):
        super().__init__(
            conversation=conversation,
            role_key_human=role_key_human,
            role_key_model=role_key_model,
-            roles=roles,
        )


--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -85,7 +85,7 @@ def train(
    model.generation_config.do_sample = True

    model_ref = None
-    if cfg.rl and cfg.rl != "orpo":
+    if cfg.rl:
        if cfg.adapter and not cfg.rl_adapter_ref_model:
            # use built-in trl autounwrap
            LOG.debug("Passing model_ref: None to RL trainer")
@@ -110,6 +110,9 @@ def train(
        total_num_steps,
    )

+    if hasattr(model, "config"):
+        model.config.use_cache = False
+
    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -1,8 +0,0 @@
-"""
-Basic utils for Axolotl
-"""
-import importlib
-
-
-def is_mlflow_available():
-    return importlib.util.find_spec("mlflow") is not None
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -6,7 +6,7 @@ import logging
 import os
 from shutil import copyfile
 from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING, Any, Dict, List
+from typing import TYPE_CHECKING, Dict, List

 import evaluate
 import numpy as np
@@ -27,9 +27,7 @@ from transformers import (
 )
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy

-from axolotl.utils import is_mlflow_available
 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
 from axolotl.utils.distributed import (
    barrier,
    broadcast_dict,
@@ -542,7 +540,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
    return CausalLMBenchEvalCallback


-def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
+def log_prediction_callback_factory(trainer: Trainer, tokenizer):
    class LogPredictionCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""

@@ -599,13 +597,15 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
                return ranges

            def log_table_from_dataloader(name: str, table_dataloader):
-                table_data: Dict[str, List[Any]] = {
-                    "id": [],
-                    "Prompt": [],
-                    "Correct Completion": [],
-                    "Predicted Completion (model.generate)": [],
-                    "Predicted Completion (trainer.prediction_step)": [],
-                }
+                table = wandb.Table(  # type: ignore[attr-defined]
+                    columns=[
+                        "id",
+                        "Prompt",
+                        "Correct Completion",
+                        "Predicted Completion (model.generate)",
+                        "Predicted Completion (trainer.prediction_step)",
+                    ]
+                )
                row_index = 0

                for batch in tqdm(table_dataloader):
@@ -709,29 +709,16 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
                    ) in zip(
                        prompt_texts, completion_texts, predicted_texts, pred_step_texts
                    ):
-                        table_data["id"].append(row_index)
-                        table_data["Prompt"].append(prompt_text)
-                        table_data["Correct Completion"].append(completion_text)
-                        table_data["Predicted Completion (model.generate)"].append(
-                            prediction_text
+                        table.add_data(
+                            row_index,
+                            prompt_text,
+                            completion_text,
+                            prediction_text,
+                            pred_step_text,
                        )
-                        table_data[
-                            "Predicted Completion (trainer.prediction_step)"
-                        ].append(pred_step_text)
                        row_index += 1
-                if logger == "wandb":
-                    wandb.run.log({f"{name} - Predictions vs Ground Truth": pd.DataFrame(table_data)})  # type: ignore[attr-defined]
-                elif logger == "mlflow" and is_mlflow_available():
-                    import mlflow

-                    tracking_uri = AxolotlInputConfig(
-                        **self.cfg.to_dict()
-                    ).mlflow_tracking_uri
-                    mlflow.log_table(
-                        data=table_data,
-                        artifact_file="PredictionsVsGroundTruth.json",
-                        tracking_uri=tracking_uri,
-                    )
+                wandb.run.log({f"{name} - Predictions vs Ground Truth": table})  # type: ignore[attr-defined]

            if is_main_process():
                log_table_from_dataloader("Eval", eval_dataloader)
@@ -761,11 +748,6 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    copyfile(self.axolotl_config_path, temp_file.name)
-                    artifact = wandb.Artifact(
-                        f"config-{wandb.run.id}", type="axolotl-config"
-                    )
-                    artifact.add_file(temp_file.name)
-                    wandb.log_artifact(artifact)
                    wandb.save(temp_file.name)
                LOG.info(
                    "The Axolotl config has been saved to the WandB run under files."
--- a/src/axolotl/utils/callbacks/lisa.py
+++ b/src/axolotl/utils/callbacks/lisa.py
@@ -1,91 +0,0 @@
-"""
-module for LISA
-
-Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
-Arxiv: https://arxiv.org/abs/2403.17919
-License: Apache 2.0
-"""
-
-import logging
-from functools import reduce
-from typing import TYPE_CHECKING
-
-import numpy as np
-from transformers import TrainerCallback
-
-if TYPE_CHECKING:
-    from axolotl.core.trainer_builder import AxolotlTrainer
-
-LOG = logging.getLogger("axolotl.callbacks.lisa")
-
-
-def lisa_callback_factory(trainer: "AxolotlTrainer"):
-    class LISACallback(TrainerCallback):
-        """trainer callback for lisa layer switching"""
-
-        def __init__(
-            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
-        ):
-            super().__init__()
-            self.n_layers = n_layers
-            self.step_interval = step_interval
-            self.layers_attribute = layers_attribute
-            self.trainer = trainer
-
-            reduce(getattr, self.layers_attribute.split("."), self.trainer.model)
-
-            self.total_layers = len(
-                reduce(getattr, self.layers_attribute.split("."), self.trainer.model)
-            )
-            self.active_layers_indices = []
-
-            layers = reduce(
-                getattr, self.layers_attribute.split("."), self.trainer.model
-            )
-            LOG.info(
-                f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers*100/len(layers)}%) every {self.step_interval} steps"
-            )
-
-        def freeze_all_layers(self):
-            layers = reduce(
-                getattr, self.layers_attribute.split("."), self.trainer.model
-            )
-            for layer in layers:
-                for param in layer.parameters():
-                    param.requires_grad = False
-
-        def on_step_begin(
-            self, args, state, control, **kwargs
-        ):  # pylint: disable=unused-argument
-            # Check if it's time to switch active layers, including at step 0
-            if state.global_step % self.step_interval == 0 or state.global_step == 1:
-                self.switch_active_layers()
-
-        def switch_active_layers(self):
-            # First, disable gradients for all layers
-            self.freeze_all_layers()
-
-            # Randomly select n_layers to activate
-            layers = reduce(
-                getattr, self.layers_attribute.split("."), self.trainer.model
-            )
-            self.active_layers_indices = np.random.choice(
-                range(self.total_layers), self.n_layers, replace=False
-            )
-            LOG.info(
-                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
-            )
-
-            # Enable gradients only for the selected layers
-            for idx in self.active_layers_indices:
-                for param in layers[idx].parameters():
-                    param.requires_grad = True
-
-    lisa_callback = LISACallback(
-        n_layers=trainer.args.lisa_n_layers,
-        step_interval=trainer.args.lisa_step_interval,
-        trainer=trainer,
-        layers_attribute=trainer.args.lisa_layers_attribute,
-    )
-
-    return lisa_callback
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
@@ -21,9 +21,8 @@ def chat_templates(user_choice: str):
    templates = {
        "alpaca": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
        "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
-        "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+        "chatml": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
        "gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
-        "cohere": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
    }

    if user_choice in templates:
--- a/src/axolotl/utils/collators.py
+++ b/src/axolotl/utils/collators.py
@@ -217,24 +217,13 @@ class PretrainingBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    Collator for multipack specific to the using the BatchSampler
    """

-    def __init__(self, *args, multipack_attn=True, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.multipack_attn = multipack_attn
-
    def __call__(self, features, return_tensors=None):
        chunked_data = {}
        for feature in features.keys():
            if feature == "length":
                continue
            if feature == "attention_mask":
-                if self.multipack_attn:
-                    arrays = [
-                        (i + 1) * np.array(item[feature])
-                        for i, item in enumerate(features[feature])
-                        if feature in item
-                    ]
-                else:
-                    arrays = [(1) * np.array(item) for item in features[feature]]
+                arrays = [(1) * np.array(item) for item in features[feature]]
                chunked_data[feature] = np.concatenate(arrays)
            else:
                arrays = [np.array(item) for item in features[feature]]
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -119,10 +119,6 @@ def normalize_config(cfg):
    model_config = load_model_config(cfg)
    cfg.model_config_type = model_config.model_type

-    cfg.tokenizer_config = (
-        cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
-    )
-
    # figure out if the model is llama
    cfg.is_llama_derived_model = (
        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
@@ -195,11 +191,6 @@ def normalize_cfg_datasets(cfg):
                        f"updating dataset {ds_cfg.path} with `conversation: chatml` to match your chat_template"
                    )
                    cfg.datasets[idx].conversation = "chatml"
-                if ds_cfg.type == "orpo.chat_template" and not ds_cfg.chat_template:
-                    LOG.info(
-                        f"updating dataset {ds_cfg.path} with `chat_template: chatml` to match your chat_template"
-                    )
-                    cfg.datasets[idx].chat_template = "chatml"


 def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
@@ -208,11 +199,11 @@ def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
            dict(
                AxolotlConfigWCapabilities(
                    **cfg.to_dict(), capabilities=capabilities
-                ).model_dump(exclude_none=True)
+                ).model_dump(exclude_unset=True)
            )
        )
    return DictDefault(
-        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
+        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
    )


--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -1,13 +1,12 @@
 """
 Module for pydantic models for configuration
 """
-
 # pylint: disable=too-many-lines

 import logging
 import os
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Optional, Union

 from pydantic import BaseModel, Field, conlist, field_validator, model_validator
 from transformers import SchedulerType
@@ -62,11 +61,7 @@ class RemappedParameters(BaseModel):
 class PretrainingDataset(BaseModel):
    """pretraining dataset configuration subset"""

-    name: Optional[str] = None
    path: Optional[str] = None
-    split: Optional[str] = "train"
-    text_column: Optional[str] = "text"
-    type: Optional[str] = "pretrain"


 class UserDefinedPrompterType(BaseModel):
@@ -98,12 +93,9 @@ class SFTDataset(BaseModel):
    ds_type: Optional[str] = None
    train_on_split: Optional[str] = None

-    field: Optional[str] = None
    field_human: Optional[str] = None
    field_model: Optional[str] = None

-    roles: Optional[Dict[str, List[str]]] = None
-

 class UserDefinedDPOType(BaseModel):
    """User defined typing for DPO"""
@@ -132,7 +124,6 @@ class RLType(str, Enum):
    dpo = "dpo"  # pylint: disable=invalid-name
    ipo = "ipo"  # pylint: disable=invalid-name
    kto_pair = "kto_pair"  # pylint: disable=invalid-name
-    orpo = "orpo"  # pylint: disable=invalid-name


 class ChatTemplate(str, Enum):
@@ -142,7 +133,6 @@ class ChatTemplate(str, Enum):
    chatml = "chatml"  # pylint: disable=invalid-name
    inst = "inst"  # pylint: disable=invalid-name
    gemma = "gemma"  # pylint: disable=invalid-name
-    cohere = "cohere"  # pylint: disable=invalid-name


 class LoftQConfig(BaseModel):
@@ -158,6 +148,12 @@ class PeftConfig(BaseModel):
    loftq_config: Optional[LoftQConfig] = None


+class AutoType(str, Enum):
+    """
+    auto type string configuration subset - used for bf16
+
+    A string-valued enum whose only member is the literal ``"auto"``; it lets a
+    config field be typed as ``Union[AutoType, bool]`` so that the string
+    ``"auto"`` is accepted alongside plain booleans.
+    """
+
+    AUTO = "auto"
+
+
 class SpecialTokensConfig(BaseModel):
    """Special tokens configuration subset"""

@@ -186,8 +182,7 @@ class LoraConfig(BaseModel):
    peft_layers_to_transform: Optional[List[int]] = None
    peft: Optional[PeftConfig] = None
    peft_use_dora: Optional[bool] = None
-    peft_use_rslora: Optional[bool] = None
-    peft_layer_replication: Optional[List[Tuple[int, int]]] = None
+    peft_use_relora: Optional[bool] = None

    lora_on_cpu: Optional[bool] = None
    gptq: Optional[bool] = None
@@ -243,6 +238,17 @@ class LoraConfig(BaseModel):
                    raise ValueError("Require cfg.load_in_4bit to be True for qlora")
        return self

+    @model_validator(mode="before")
+    @classmethod
+    def validate_quantized_dora(cls, data):
+        """
+        Reject configurations that enable DoRA together with quantized loading.
+
+        Registered as a "before" validator; `data` is read with `.get`, so it is
+        expected to be the mapping-like raw input. It is returned untouched when
+        the check passes.
+
+        Raises:
+            ValueError: if `peft_use_dora` is truthy and either `load_in_8bit`
+                or `load_in_4bit` is truthy.
+        """
+        wants_dora = data.get("peft_use_dora")
+        loads_quantized = data.get("load_in_8bit") or data.get("load_in_4bit")
+        if wants_dora and loads_quantized:
+            raise ValueError(
+                "`peft_use_dora` is not currently compatible with quantized weights."
+            )
+        return data
+

 class ReLoRAConfig(BaseModel):
    """ReLoRA configuration subset"""
@@ -298,25 +304,14 @@ class HyperparametersConfig(BaseModel):
        },
    )

-    train_on_inputs: Optional[bool] = False
+    train_on_inputs: Optional[bool] = None
    group_by_length: Optional[bool] = None

    learning_rate: Union[str, float]
-    weight_decay: Optional[float] = 0.0
-    optimizer: Optional[
-        Union[OptimizerNames, Literal["lion_pytorch"]]
-    ] = OptimizerNames.ADAMW_HF.value
-    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
-        default=None, metadata={"help": "Optional arguments to supply to optimizer."}
-    )
-    optim_target_modules: Optional[Union[List[str], Literal["all_linear"]]] = Field(
-        default=None,
-        metadata={
-            "help": "The target modules to optimize, i.e. the module names that you would like to train."
-        },
-    )
+    weight_decay: Optional[float] = None
+    optimizer: Optional[Union[OptimizerNames, Literal["lion_pytorch"]]] = None
    torchdistx_path: Optional[str] = None
-    lr_scheduler: Optional[SchedulerType] = "cosine"
+    lr_scheduler: Optional[SchedulerType] = None
    lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
    lr_quadratic_warmup: Optional[bool] = None
    cosine_min_lr_ratio: Optional[float] = None
@@ -355,7 +350,6 @@ class ModelOutputConfig(BaseModel):
    hub_model_id: Optional[str] = None
    hub_strategy: Optional[str] = None
    save_safetensors: Optional[bool] = None
-    save_only_model: Optional[bool] = None


 class MLFlowConfig(BaseModel):
@@ -367,23 +361,6 @@ class MLFlowConfig(BaseModel):
    hf_mlflow_log_artifacts: Optional[bool] = None


-class LISAConfig(BaseModel):
-    """LISA options"""
-
-    lisa_n_layers: Optional[int] = Field(
-        default=None,
-        metadata={"help": "the number of activate layers in LISA"},
-    )
-    lisa_step_interval: Optional[int] = Field(
-        default=None,
-        metadata={"help": "how often to switch layers in LISA"},
-    )
-    lisa_layers_attribute: Optional[str] = Field(
-        default="model.layers",
-        metadata={"help": "path under the model to access the layers"},
-    )
-
-
 class WandbConfig(BaseModel):
    """wandb configuration subset"""

@@ -418,7 +395,6 @@ class AxolotlInputConfig(
    HyperparametersConfig,
    WandbConfig,
    MLFlowConfig,
-    LISAConfig,
    RemappedParameters,
    DeprecatedParameters,
    BaseModel,
@@ -439,13 +415,12 @@ class AxolotlInputConfig(

    datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
    test_datasets: Optional[conlist(Union[SFTDataset, DPODataset], min_length=1)] = None  # type: ignore
-    shuffle_merged_datasets: Optional[bool] = True
    dataset_prepared_path: Optional[str] = None
    dataset_shard_num: Optional[int] = None
    dataset_shard_idx: Optional[int] = None

    pretraining_dataset: Optional[  # type: ignore
-        conlist(Union[PretrainingDataset, SFTDataset], min_length=1)
+        conlist(Union[SFTDataset, PretrainingDataset], min_length=1)
    ] = Field(
        default=None, metadata={"help": {"streaming dataset to use for pretraining"}}
    )
@@ -456,8 +431,6 @@ class AxolotlInputConfig(
    dataloader_prefetch_factor: Optional[int] = None
    dataloader_drop_last: Optional[bool] = None

-    remove_unused_columns: Optional[bool] = None
-
    push_dataset_to_hub: Optional[str] = None
    hf_use_auth_token: Optional[bool] = None

@@ -485,7 +458,7 @@ class AxolotlInputConfig(
    loss_watchdog_threshold: Optional[float] = None
    loss_watchdog_patience: Optional[int] = None

-    bf16: Optional[Union[Literal["auto"], bool]] = "auto"
+    bf16: Optional[Union[AutoType, bool]] = AutoType.AUTO
    fp16: Optional[bool] = None
    bfloat16: Optional[bool] = None  # for non-AMP cases
    float16: Optional[bool] = None  # for non-AMP cases
@@ -499,19 +472,11 @@ class AxolotlInputConfig(

    unfrozen_parameters: Optional[List[str]] = None

-    sequence_len: int = Field(default=512)
+    sequence_len: int = Field(default=1024)
    sample_packing: Optional[bool] = None
    eval_sample_packing: Optional[bool] = None
    pad_to_sequence_len: Optional[bool] = None

-    pretrain_multipack_buffer_size: Optional[int] = 10_000
-    pretrain_multipack_attn: Optional[bool] = Field(
-        default=True,
-        metadata={
-            "help": "whether to prevent cross attention for packed sequences during pretraining",
-        },
-    )
-
    xformers_attention: Optional[bool] = None
    sdp_attention: Optional[bool] = None
    s2_attention: Optional[bool] = None
@@ -550,13 +515,10 @@ class AxolotlInputConfig(

    neftune_noise_alpha: Optional[float] = None

-    orpo_alpha: Optional[float] = None
-
    max_memory: Optional[
        Dict[Union[int, Literal["cpu", "disk"]], Union[int, str]]
    ] = None
    gpu_memory_limit: Optional[Union[int, str]] = None
-    low_cpu_mem_usage: Optional[bool] = None

    chat_template: Optional[ChatTemplate] = None
    default_system_message: Optional[str] = None
@@ -569,10 +531,10 @@ class AxolotlInputConfig(
    sample_packing_eff_est: Optional[float] = None
    axolotl_config_path: Optional[str] = None

-    is_falcon_derived_model: Optional[bool] = Field(default=None)
-    is_llama_derived_model: Optional[bool] = Field(default=None)
-    is_mistral_derived_model: Optional[bool] = Field(default=None)
-    is_qwen_derived_model: Optional[bool] = Field(default=None)
+    is_falcon_derived_model: Optional[bool] = Field(default=False)
+    is_llama_derived_model: Optional[bool] = Field(default=False)
+    is_mistral_derived_model: Optional[bool] = Field(default=False)
+    is_qwen_derived_model: Optional[bool] = Field(default=False)

    @field_validator("datasets", mode="before")
    @classmethod
@@ -647,20 +609,6 @@ class AxolotlInputConfig(

        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_sample_packing_wo_flash(cls, data):
-        if (
-            data.get("sample_packing")
-            and not data.get("flash_attention")
-            and not data.get("sdp_attention")
-        ):
-            LOG.warning(
-                "sample_packing without flash_attention or sdp_attention does not handle cross-attention."
-            )
-
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_rl(cls, data):
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -1,10 +1,13 @@
-"""data handling specific to SFT"""
-
+"""Module containing data utilities"""
 import functools
+import hashlib
 import logging
+from collections import defaultdict
 from pathlib import Path
-from typing import List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

+import torch
+import yaml
 from datasets import (
    Dataset,
    DatasetDict,
@@ -14,11 +17,13 @@ from datasets import (
 )
 from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import HFValidationError
+from torch.utils.data import RandomSampler
 from transformers import PreTrainedTokenizerBase

 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_strategies import load
+from axolotl.prompt_strategies.dpo import load as load_dpo
 from axolotl.prompt_tokenizers import (
    AlpacaMultipleChoicePromptTokenizingStrategy,
    AlpacaPromptTokenizingStrategy,
@@ -39,18 +44,26 @@ from axolotl.prompters import (
    SummarizeTLDRPrompter,
    UnsupportedPrompter,
 )
-from axolotl.utils.data.pretraining import wrap_pretraining_dataset
-from axolotl.utils.data.utils import md5
+from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process, zero_first
+from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.trainer import (
    calculate_total_num_steps,
    process_datasets_for_packing,
+    process_pretraining_datasets_for_packing,
 )

 LOG = logging.getLogger("axolotl")


+def md5(to_hash: str, encoding: str = "utf-8") -> str:
+    """
+    Return the hexadecimal MD5 digest of `to_hash` encoded with `encoding`.
+
+    The hash is first requested with `usedforsecurity=False`; if the
+    constructor rejects that keyword with a `TypeError`, a plain
+    `hashlib.md5` call is used instead.
+    """
+    payload = to_hash.encode(encoding)
+    try:
+        hasher = hashlib.md5(payload, usedforsecurity=False)
+    except TypeError:
+        hasher = hashlib.md5(payload)  # nosec
+    return hasher.hexdigest()
+
+
 def prepare_dataset(cfg, tokenizer):
    prompters = []
    if not cfg.pretraining_dataset:
@@ -68,15 +81,12 @@ def prepare_dataset(cfg, tokenizer):
                )
    else:
        path = cfg.pretraining_dataset
-        split = "train"
        name = None
        if isinstance(cfg.pretraining_dataset, list) and isinstance(
            cfg.pretraining_dataset[0], dict
        ):
            path = cfg.pretraining_dataset[0]["path"]
            name = cfg.pretraining_dataset[0]["name"]
-            if "split" in cfg.pretraining_dataset[0]:
-                split = cfg.pretraining_dataset[0]["split"]

        ds_wrapper_partial = functools.partial(
            get_dataset_wrapper,
@@ -87,14 +97,13 @@ def prepare_dataset(cfg, tokenizer):
        )

        train_dataset = wrap_pretraining_dataset(
-            load_dataset(path, streaming=True, split=split, name=name),
+            load_dataset(path, streaming=True, split="train", name=name),
            tokenizer,
            cfg,
            ds_wrapper_partial,
            max_tokens=cfg.sequence_len,
            batch_size=cfg.micro_batch_size,
            seed=cfg.seed or 42,
-            buffer_size=cfg.pretrain_multipack_buffer_size or 10_000,
        )
        # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
        train_dataset = train_dataset.with_format("torch")
@@ -125,7 +134,7 @@ def load_tokenized_prepared_datasets(
    split="train",
 ) -> Tuple[DatasetDict, List[Prompter]]:
    cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets
-    tokenizer_name = cfg.tokenizer_config
+    tokenizer_name = tokenizer.__class__.__name__
    ds_hash = str(
        md5(
            (
@@ -168,7 +177,6 @@ def load_tokenized_prepared_datasets(
    except Exception:  # pylint: disable=broad-except # nosec
        pass

-    # pylint: disable=duplicate-code
    if dataset:
        ...
    elif (
@@ -215,7 +223,7 @@ def load_tokenized_prepared_datasets(
                    token=use_auth_token,
                )
                ds_from_hub = True
-            except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
+            except (FileNotFoundError, ConnectionError, HFValidationError):
                pass

            ds_from_cloud = False
@@ -282,17 +290,14 @@ def load_tokenized_prepared_datasets(
            local_path = Path(config_dataset.path)
            if local_path.exists():
                if local_path.is_dir():
-                    if config_dataset.data_files:
-                        ds_type = get_ds_type(config_dataset)
-                        ds = load_dataset(
-                            ds_type,
-                            name=config_dataset.name,
-                            data_files=config_dataset.data_files,
-                            streaming=False,
-                            split=None,
-                        )
-                    else:
-                        ds = load_from_disk(config_dataset.path)
+                    # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
+                    ds = load_dataset(
+                        config_dataset.path,
+                        name=config_dataset.name,
+                        data_files=config_dataset.data_files,
+                        streaming=False,
+                        split=None,
+                    )
                elif local_path.is_file():
                    ds_type = get_ds_type(config_dataset)

@@ -379,15 +384,14 @@ def load_tokenized_prepared_datasets(
                d_base_type = d_type_split[0]
                d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None

-            if isinstance(ds, DatasetDict):
-                if config_dataset.split and config_dataset.split in ds:
-                    ds = ds[config_dataset.split]
-                elif split in ds:
-                    ds = ds[split]
-                else:
-                    raise ValueError(
-                        f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `"
-                    )
+            if config_dataset.split and config_dataset.split in ds:
+                ds = ds[config_dataset.split]
+            elif split in ds:
+                ds = ds[split]
+            elif isinstance(ds, DatasetDict):
+                raise ValueError(
+                    f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `"
+                )

            # support for using a subset of the data
            if config_dataset.shards:
@@ -411,11 +415,8 @@ def load_tokenized_prepared_datasets(
        dataset = concatenate_datasets(datasets)

        if len(datasets) > 1:
-            if cfg.shuffle_merged_datasets:
-                LOG.debug("shuffle merged datasets")
-                dataset = dataset.shuffle(seed=seed)
-            else:
-                LOG.debug("NOT shuffling merged datasets")
+            LOG.info("shuffle merged datasets")
+            dataset = dataset.shuffle(seed=seed)

        dataset, _ = process_datasets_for_packing(cfg, dataset, None)

@@ -679,3 +680,297 @@ def get_dataset_wrapper(
        )

    return dataset_wrapper, dataset_prompter
+
+
+def encode_pretraining(
+    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
+) -> Dict[str, List]:
+    """
+    Tokenize text examples and pack them into rows of `max_tokens` tokens.
+
+    Every example is tokenized with truncation at `max_tokens - 2`, then an EOS
+    token (attention 1) and a PAD token (attention 0) are appended. Examples
+    are concatenated in order into the current row; when the next example
+    would not fit, the row is right-padded with PAD / attention 0 up to
+    `max_tokens`, emitted, and a new row is started with that example.
+
+    Returns:
+        dict with `input_ids`, `labels` (same values as `input_ids`) and
+        `attention_mask`, each a list of lists of ints.
+    """
+    encoded = tokenizer(
+        examples,
+        truncation=True,
+        max_length=max_tokens - 2,
+        add_special_tokens=True,
+    )
+
+    # one tensor per example, with the EOS + PAD suffix already attached
+    suffix_ids = [tokenizer.eos_token_id, tokenizer.pad_token_id]
+    input_ids = [
+        torch.cat((torch.tensor(seq), torch.tensor(suffix_ids)), dim=0)
+        for seq in encoded["input_ids"]
+    ]
+    # EOS is attended to (1), the trailing PAD is masked out (0)
+    attention_mask = [
+        torch.cat((torch.tensor(seq), torch.tensor([1, 0])), dim=0)
+        for seq in encoded["attention_mask"]
+    ]
+
+    def _empty_row():
+        return torch.tensor([], dtype=torch.long)
+
+    def _pad_row(row, fill_value):
+        # right-pad `row` with `fill_value` so it is exactly max_tokens long
+        filler = torch.full(
+            (max_tokens - row.numel(),),
+            fill_value,
+            dtype=torch.long,
+        )
+        return torch.cat((row, filler), dim=0)
+
+    packed_ids = []
+    packed_masks = []
+    row_ids = _empty_row()
+    row_mask = _empty_row()
+
+    for ids, mask in zip(input_ids, attention_mask):
+        row_is_full = row_ids.numel() == max_tokens
+        if not row_is_full and row_ids.numel() + ids.numel() <= max_tokens:
+            # the example still fits into the current row
+            row_ids = torch.cat((row_ids, ids), dim=0)
+            row_mask = torch.cat((row_mask, mask), dim=0)
+            continue
+
+        # close the current row (padding it first unless it is already full)
+        if not row_is_full:
+            row_ids = _pad_row(row_ids, tokenizer.pad_token_id)
+            row_mask = _pad_row(row_mask, 0)
+        packed_ids.append(row_ids)
+        packed_masks.append(row_mask)
+
+        # start the next row with the example that did not fit
+        row_ids = torch.cat((_empty_row(), ids), dim=0)
+        row_mask = torch.cat((_empty_row(), mask), dim=0)
+
+    if row_ids.numel() > 0:  # flush any leftover tokens
+        if row_ids.numel() < max_tokens:  # make all sequences equal in size
+            row_ids = _pad_row(row_ids, tokenizer.pad_token_id)
+            row_mask = _pad_row(row_mask, 0)
+        packed_ids.append(row_ids)
+        packed_masks.append(row_mask)
+
+    ret = {
+        "input_ids": [row.tolist() for row in packed_ids],
+        "labels": [row.tolist() for row in packed_ids],
+        "attention_mask": [row.tolist() for row in packed_masks],
+    }
+
+    LOG.debug(len(ret["input_ids"]))
+    return ret
+
+
+def wrap_pretraining_dataset(
+    dataset,
+    tokenizer,
+    cfg,
+    ds_wrapper_fn,
+    max_tokens=2048,
+    batch_size=1,
+    seed=42,
+    buffer_size=10_000,
+):
+    """
+    Shuffle a pretraining dataset and map it through the matching encoder.
+
+    With `cfg.sample_packing` set, rows are encoded by
+    `encode_packed_pretraining` using a
+    `PretrainingBatchSamplerDataCollatorForSeq2Seq`, and `cfg.micro_batch_size`
+    is overwritten with 1 as a side effect. Otherwise rows are encoded by
+    `encode_pretraining`.
+
+    Args:
+        dataset: dataset supporting `shuffle(seed=, buffer_size=)` and batched
+            `map` (presumably a streaming dataset - confirm against callers).
+        tokenizer: tokenizer handed to the collator / encoder.
+        cfg: config object; `sample_packing` is read, `micro_batch_size` may be set.
+        ds_wrapper_fn: callable forwarded to `encode_packed_pretraining`.
+        max_tokens: maximum sequence length per encoded row.
+        batch_size: micro batch size used to size packed batches.
+        seed: shuffle seed.
+        buffer_size: shuffle buffer size, also used as the `map` batch size.
+
+    Returns:
+        The shuffled, mapped dataset with the original columns removed.
+    """
+    if cfg.sample_packing:
+        collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
+            tokenizer,
+            return_tensors="pt",
+            padding=True,
+            pad_to_multiple_of=max_tokens * batch_size,
+        )
+        encode = functools.partial(
+            encode_packed_pretraining,
+            collate_fn,
+            ds_wrapper_fn,
+            max_seq_length=max_tokens,
+            batch_size=batch_size,
+        )
+        # set this to 1 so downstream data_loader doesn't try to increase the batch again
+        cfg.micro_batch_size = 1
+    else:
+        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
+
+    # Streaming datasets may not carry schema information, in which case
+    # `features` is None; fall back to the keys of the first row so that
+    # `.keys()` is never called on None.
+    if dataset.features is not None:
+        remove_columns = list(dataset.features.keys())
+    else:
+        first_row = next(iter(dataset), None)
+        remove_columns = list(first_row.keys()) if first_row is not None else []
+
+    dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
+    dataset = dataset.map(
+        encode,
+        batched=True,
+        batch_size=buffer_size,
+        # input_columns="text",
+        # remove all the existing columns after mapping since they end up having
+        # a different length than the encoded/tokenized column
+        remove_columns=remove_columns,
+    )
+    return dataset
+
+
+def encode_packed_pretraining(
+    collate_fn,
+    ds_wrapper: Callable,
+    examples: Dict[str, List],
+    max_seq_length: int = 2048,
+    batch_size: int = 4,
+) -> Dict[str, List]:
+    """
+    Tokenize a batch of examples and regroup them into packed rows.
+
+    The examples are turned into a `Dataset`, passed through `ds_wrapper`
+    (whose first return value is used), processed by
+    `process_pretraining_datasets_for_packing`, and then sampled with a
+    `MultipackBatchSampler` whose `batch_max_len` is
+    `batch_size * max_seq_length`. Every sampled entry is collated with
+    `collate_fn` and its features are appended column by column.
+
+    Args:
+        collate_fn: collator called on the features of each sampled entry.
+        ds_wrapper: callable returning a sequence whose first item is the
+            tokenized dataset.
+        examples: column-name -> values mapping for one batch.
+        max_seq_length: maximum length of a single sequence.
+        batch_size: number of sequences' worth of tokens per packed batch.
+
+    Returns:
+        Mapping of feature name to a list of collated values (one per packed
+        entry); the "length" feature is not included.
+    """
+    # pylint: disable=duplicate-code
+    # tokenize all the examples
+    # rows get split with stride (overlap)
+    train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
+
+    train_dataset = process_pretraining_datasets_for_packing(
+        train_dataset, max_seq_length
+    )
+
+    sampler = MultipackBatchSampler(
+        RandomSampler(train_dataset),
+        batch_size=1,
+        drop_last=True,
+        batch_max_len=batch_size * max_seq_length,
+        lengths=get_dataset_lengths(train_dataset),
+    )
+
+    chunked_data = defaultdict(list)
+
+    for batch in sampler:
+        for data in batch:
+            features = train_dataset[data]
+            # drop tokenizer bookkeeping columns before collating
+            for unwanted in ("num_truncated_tokens", "overflow_to_sample_mapping"):
+                if unwanted in features:
+                    del features[unwanted]
+            if "labels" not in features:
+                features["labels"] = features["input_ids"].copy()
+            collated_features = collate_fn(features)
+
+            for feature in features.keys():
+                if feature == "length":
+                    continue
+                chunked_data[feature].append(collated_features[feature].squeeze(0))
+
+    return chunked_data
+
+
+def _get_path(ds_hash, cfg):
+    """
+    Build the directory path for the prepared dataset identified by `ds_hash`.
+
+    The parent directory is `cfg.dataset_prepared_path` when it is set, and
+    `DEFAULT_DATASET_PREPARED_PATH` otherwise.
+    """
+    base_dir = cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
+    return Path(base_dir) / ds_hash
+
+
+def _load_preprocessed_ds(cfg, sub_cfg):
+    """
+    Load a previously prepared dataset for `sub_cfg` from disk, if available.
+
+    The location is derived from the md5 of the YAML dump of `sub_cfg`.
+    Returns None unless `cfg.dataset_prepared_path` is set, the target
+    directory contains at least one entry, and `cfg.is_preprocess` is falsy.
+    """
+    cache_key = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
+    prepared_ds_path = _get_path(cache_key, cfg)
+
+    # checks are kept in this order so the directory is only globbed when a
+    # prepared path has been configured
+    if not cfg.dataset_prepared_path:
+        return None
+    if not any(prepared_ds_path.glob("*")):
+        return None
+    if cfg.is_preprocess:
+        return None
+
+    LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+    return load_from_disk(str(prepared_ds_path))
+
+
+def _save_preprocessed_ds(cfg, sub_cfg, dataset):
+    """
+    Persist `dataset` to the prepared-dataset directory derived from `sub_cfg`.
+
+    The location is derived from the md5 of the YAML dump of `sub_cfg`.
+    Nothing is written unless `cfg.is_preprocess` is truthy and
+    `is_main_process()` returns a truthy value.
+    """
+    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
+    prepared_ds_path = _get_path(ds_hash, cfg)
+
+    if cfg.is_preprocess and is_main_process():
+        # the message previously claimed the dataset was being loaded
+        LOG.info(f"Saving prepared dataset to disk at {prepared_ds_path}...")
+        dataset.save_to_disk(str(prepared_ds_path))
+
+
+def load_prepare_dpo_datasets(cfg):
+    """
+    Load (or restore from disk) the train and optional eval datasets for DPO.
+
+    For each of `cfg.datasets` and `cfg.test_datasets`, a prepared copy is
+    loaded via `_load_preprocessed_ds` when available; otherwise every
+    configured dataset is loaded, transformed according to its `type`, and
+    concatenated. Freshly built datasets are handed to
+    `_save_preprocessed_ds`. All of this runs inside
+    `zero_first(is_main_process())`.
+
+    Returns:
+        `(train_dataset, eval_dataset)`; `eval_dataset` is None when
+        `cfg.test_datasets` is not set or the eval dataset is falsy.
+    """
+
+    def load_split(dataset_cfgs, _cfg):
+        # Keep each loaded dataset paired with the index of the config it came
+        # from: a json config with several data files yields several datasets,
+        # so a dataset's position is not a valid index into `dataset_cfgs`.
+        loaded: List[Tuple[int, Any]] = []
+        for cfg_idx, ds_cfg in enumerate(dataset_cfgs):
+            if ds_cfg["ds_type"] == "json":
+                for data_file in ds_cfg["data_files"]:
+                    data_files = {ds_cfg["split"]: data_file}
+                    ds = load_dataset(  # pylint: disable=invalid-name
+                        "json",
+                        data_files=data_files,
+                        split=ds_cfg["split"],
+                    )
+                    loaded.append((cfg_idx, ds))
+            else:
+                ds = load_dataset(  # pylint: disable=invalid-name
+                    ds_cfg["path"],
+                    split=ds_cfg["split"],
+                )
+                loaded.append((cfg_idx, ds))
+
+        split_datasets: List[Any] = []
+        for cfg_idx, data_set in loaded:
+            _type = dataset_cfgs[cfg_idx]["type"]
+            if _type:
+                if isinstance(_type, DictDefault):
+                    _type = "user_defined.default"
+                # dataset_idx refers to the originating config entry
+                # (presumably used by load_dpo to look that entry up - confirm)
+                ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=cfg_idx)
+                data_set = data_set.map(
+                    ds_transform_fn,
+                    desc="Mapping RL Dataset",
+                )
+            # If no `type` is provided, assume the dataset is already in the expected format with
+            # "prompt", "chosen" and "rejected" already preprocessed
+            split_datasets.append(data_set)
+
+        return concatenate_datasets(split_datasets)
+
+    with zero_first(is_main_process()):
+        train_is_preprocessed = False
+        eval_is_preprocessed = False
+        if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
+            train_is_preprocessed = True
+        else:
+            train_dataset = load_split(cfg.datasets, cfg)
+
+        eval_dataset = None
+        if cfg.test_datasets:
+            if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
+                eval_is_preprocessed = True
+            else:
+                eval_dataset = load_split(cfg.test_datasets, cfg)
+        if not eval_dataset:
+            eval_dataset = None
+
+        if not train_is_preprocessed:
+            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
+        if eval_dataset and not eval_is_preprocessed:
+            _save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
+
+    return train_dataset, eval_dataset
--- a/src/axolotl/utils/data/init.py
+++ b/src/axolotl/utils/data/init.py
@@ -1,15 +0,0 @@
-"""
-Data processing modules
-"""
-from axolotl.utils.data.dpo import load_prepare_dpo_datasets  # noqa: F401
-from axolotl.utils.data.pretraining import (  # noqa: F401
-    encode_pretraining,
-    wrap_pretraining_dataset,
-)
-from axolotl.utils.data.sft import (  # noqa: F401
-    get_dataset_wrapper,
-    load_prepare_datasets,
-    load_tokenized_prepared_datasets,
-    prepare_dataset,
-)
-from axolotl.utils.data.utils import md5  # noqa: F401
--- a/src/axolotl/utils/data/dpo.py
+++ b/src/axolotl/utils/data/dpo.py
@@ -1,114 +0,0 @@
-"""data handling specific to DPO"""
-
-import logging
-from pathlib import Path
-from typing import Any, List
-
-import yaml
-from datasets import concatenate_datasets, load_dataset, load_from_disk
-
-from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
-from axolotl.prompt_strategies.dpo import load as load_dpo
-from axolotl.utils.data.utils import md5
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process, zero_first
-
-LOG = logging.getLogger("axolotl")
-
-
-def _get_path(ds_hash, cfg):
-    prepared_ds_path = (
-        Path(cfg.dataset_prepared_path) / ds_hash
-        if cfg.dataset_prepared_path
-        else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash
-    )
-
-    return prepared_ds_path
-
-
-def _load_preprocessed_ds(cfg, sub_cfg):
-    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
-    prepared_ds_path = _get_path(ds_hash, cfg)
-    dataset = None
-
-    # pylint: disable=duplicate-code
-    if (
-        cfg.dataset_prepared_path
-        and any(prepared_ds_path.glob("*"))
-        and not cfg.is_preprocess
-    ):
-        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
-        dataset = load_from_disk(str(prepared_ds_path))
-
-    return dataset
-
-
-def _save_preprocessed_ds(cfg, sub_cfg, dataset):
-    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
-    prepared_ds_path = _get_path(ds_hash, cfg)
-
-    if cfg.is_preprocess and is_main_process():
-        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
-        dataset.save_to_disk(str(prepared_ds_path))
-
-
-def load_prepare_dpo_datasets(cfg):
-    def load_split(dataset_cfgs, _cfg):
-        split_datasets: List[Any] = []
-        for i, ds_cfg in enumerate(dataset_cfgs):
-            if ds_cfg["ds_type"] == "json":
-                for data_file in ds_cfg["data_files"]:
-                    data_files = {ds_cfg["split"]: data_file}
-                    ds = load_dataset(  # pylint: disable=invalid-name
-                        "json",
-                        data_files=data_files,
-                        split=ds_cfg["split"],
-                    )
-                    split_datasets.insert(i, ds)
-            else:
-                ds = load_dataset(  # pylint: disable=invalid-name
-                    ds_cfg["path"],
-                    split=ds_cfg["split"],
-                )
-                split_datasets.insert(i, ds)
-
-        for i, data_set in enumerate(split_datasets):
-            _type = dataset_cfgs[i]["type"]
-            if _type:
-                if isinstance(_type, DictDefault):
-                    _type = "user_defined.default"
-                ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
-                split_datasets[i] = data_set.map(
-                    ds_transform_fn,
-                    desc="Mapping RL Dataset",
-                )
-            else:
-                # If no `type` is provided, assume the dataset is already in the expected format with
-                # "prompt", "chosen" and "rejected" already preprocessed
-                split_datasets[i] = data_set
-
-        return concatenate_datasets(split_datasets)
-
-    with zero_first(is_main_process()):
-        train_is_preprocessed = False
-        eval_is_preprocessed = False
-        if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
-            train_is_preprocessed = True
-        else:
-            train_dataset = load_split(cfg.datasets, cfg)
-
-        eval_dataset = None
-        if cfg.test_datasets:
-            if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
-                eval_is_preprocessed = True
-            else:
-                eval_dataset = load_split(cfg.test_datasets, cfg)
-        if not eval_dataset:
-            eval_dataset = None
-
-        if not train_is_preprocessed:
-            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
-        if eval_dataset and not eval_is_preprocessed:
-            _save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
-
-    return train_dataset, eval_dataset
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -1,232 +0,0 @@
-"""data handling specific to pretraining"""
-
-import functools
-import logging
-from collections import defaultdict
-from typing import Callable, Dict, List, Optional
-
-import torch
-from datasets import Dataset
-from torch.utils.data import RandomSampler
-from transformers import PreTrainedTokenizerBase
-
-from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
-from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
-from axolotl.utils.trainer import process_pretraining_datasets_for_packing
-
-LOG = logging.getLogger("axolotl")
-
-
-def encode_pretraining(
-    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
-) -> Dict[str, List]:
-    res = tokenizer(
-        examples,
-        truncation=True,
-        max_length=max_tokens - 2,
-        add_special_tokens=True,
-    )
-    # Convert to PyTorch tensors
-    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
-    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
-    new_input_ids = []
-    new_attention_mask = []
-    # Append EOS and PAD tokens to input_ids, and correct attention_mask
-    for i, _ in enumerate(input_ids):
-        input_ids[i] = torch.cat(
-            (
-                input_ids[i],
-                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
-            ),
-            dim=0,
-        )
-        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)
-
-    # Concatenate tokens so that their lengths are less than max_tokens
-    buffer_input_ids = torch.tensor([], dtype=torch.long)
-    buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-    for ids, mask in zip(input_ids, attention_mask):
-        if buffer_input_ids.numel() == max_tokens:
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-        else:
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            new_input_ids.append(buffer_input_ids)
-            new_attention_mask.append(buffer_attention_mask)
-            buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_attention_mask = torch.tensor([], dtype=torch.long)
-
-            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
-
-    if buffer_input_ids.numel() > 0:  # for any leftover tokens
-        while buffer_input_ids.numel() < max_tokens:  # make all sequences equal in size
-            buffer_input_ids = torch.cat(
-                (
-                    buffer_input_ids,
-                    torch.full(
-                        (max_tokens - buffer_input_ids.numel(),),
-                        tokenizer.pad_token_id,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-            buffer_attention_mask = torch.cat(
-                (
-                    buffer_attention_mask,
-                    torch.full(
-                        (max_tokens - buffer_attention_mask.numel(),),
-                        0,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
-        new_input_ids.append(buffer_input_ids)
-        new_attention_mask.append(buffer_attention_mask)
-
-    ret = {
-        "input_ids": [seq.tolist() for seq in new_input_ids],
-        "labels": [seq.tolist() for seq in new_input_ids],
-        "attention_mask": [seq.tolist() for seq in new_attention_mask],
-    }
-
-    LOG.debug(len(ret["input_ids"]))
-    return ret
-
-
-def wrap_pretraining_dataset(
-    dataset,
-    tokenizer,
-    cfg,
-    ds_wrapper_fn,
-    max_tokens=2048,
-    batch_size=1,
-    seed=42,
-    buffer_size=10_000,
-):
-    if cfg.sample_packing:
-        collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
-            tokenizer,
-            return_tensors="pt",
-            padding=True,
-            pad_to_multiple_of=max_tokens * batch_size,
-            multipack_attn=cfg.pretrain_multipack_attn,
-        )
-        encode = functools.partial(
-            encode_packed_pretraining,
-            collate_fn,
-            ds_wrapper_fn,
-            max_seq_length=max_tokens,
-            batch_size=batch_size,
-            multipack_attn=cfg.pretrain_multipack_attn,
-        )
-        # set this to 1 so downstream data_loader doesn't try to increase the batch again
-        cfg.micro_batch_size = 1
-    else:
-        encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
-
-    if cfg.shuffle_merged_datasets:
-        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
-    else:
-        LOG.debug("NOT shuffling merged pretraining datasets")
-
-    # remove all the existing columns after mapping since they end up having
-    # a different length than the encoded/tokenized column
-    # this is empty during streaming/pretraining
-    remove_columns = []
-    if dataset.features is None:
-        for first_row in dataset:
-            remove_columns = first_row.keys()
-            break
-    else:
-        remove_columns = dataset.features.keys()
-
-    dataset = dataset.map(
-        encode,
-        batched=True,
-        batch_size=buffer_size,
-        # input_columns="text",
-        remove_columns=remove_columns,
-    )
-    return dataset
-
-
-def encode_packed_pretraining(
-    collate_fn,
-    ds_wrapper: Callable,
-    examples: Dict[str, List],
-    max_seq_length: int = 2048,
-    batch_size: int = 4,
-    multipack_attn: Optional[bool] = False,
-) -> Dict[str, List]:
-    # pylint: disable=duplicate-code
-    # tokenize all the examples
-    # rows get split with stride (overlap)
-    train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
-
-    train_dataset = process_pretraining_datasets_for_packing(
-        train_dataset,
-        max_seq_length,
-        skip_position_ids=not multipack_attn,
-    )
-
-    sampler = MultipackBatchSampler(
-        RandomSampler(train_dataset),
-        batch_size=1,
-        drop_last=True,
-        batch_max_len=batch_size * max_seq_length,
-        lengths=get_dataset_lengths(train_dataset),
-    )
-
-    chunked_data = defaultdict(list)
-
-    for batch in sampler:
-        for data in batch:
-            features = train_dataset[data]
-            if "num_truncated_tokens" in features:
-                del features["num_truncated_tokens"]
-            if "num_truncated_tokens" in features:
-                del features["num_truncated_tokens"]
-            if "overflow_to_sample_mapping" in features:
-                del features["overflow_to_sample_mapping"]
-            if "labels" not in features:
-                features["labels"] = features["input_ids"].copy()
-            collated_features = collate_fn(features)
-
-            for feature in features.keys():
-                if feature == "length":
-                    continue
-                chunked_data[feature].append(collated_features[feature].squeeze(0))
-
-    return chunked_data
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -1,10 +0,0 @@
-"""data handling helpers"""
-
-import hashlib
-
-
-def md5(to_hash: str, encoding: str = "utf-8") -> str:
-    try:
-        return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
-    except TypeError:
-        return hashlib.md5(to_hash.encode(encoding)).hexdigest()  # nosec
--- a/src/axolotl/utils/freeze.py
+++ b/src/axolotl/utils/freeze.py
@@ -3,7 +3,7 @@ module to freeze/unfreeze parameters by name
 """
 import logging
 import re
-from typing import Callable, List, Tuple, Union
+from typing import Callable, List, Tuple

 from axolotl.utils.distributed import is_main_process

@@ -99,7 +99,7 @@ def _invert_ranges(


 def _merge_ranges(
-    given_ranges: List[Tuple[int, Union[int, None]]], layer_size: int
+    given_ranges: List[Tuple[int, int | None]], layer_size: int
 ) -> List[Tuple[int, int]]:
    """
    Merges overlapping ranges and sorts the given ranges.
@@ -194,9 +194,7 @@ class LayerNamePattern:
        """
        return self.name_regex.match(name) is not None

-    def _parse_pattern(
-        self, pattern: str
-    ) -> Tuple[str, Union[Tuple[int, Union[int, None]], None]]:
+    def _parse_pattern(self, pattern: str) -> Tuple[str, Tuple[int, int | None] | None]:
        """
        Extracts the range pattern from the given pattern.

--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -5,14 +5,16 @@ import logging
 import math
 import os
 import types
-from typing import Any, Dict, Optional, Tuple, Union  # noqa: F401
+from typing import Any, Dict, List, Optional, Tuple, Type, Union  # noqa: F401

 import addict
 import bitsandbytes as bnb
+import safetensors
 import torch
 import transformers
 from accelerate import init_empty_weights
-from bitsandbytes.nn import Params4bit
+from bitsandbytes.nn import Linear4bit, Params4bit
+from fastcore.parallel import parallel
 from peft import (
    LoftQConfig,
    PeftConfig,
@@ -21,7 +23,7 @@ from peft import (
    prepare_model_for_kbit_training,
 )
 from peft.tuners.lora import QuantLinear
-from torch import nn
+from torch import Tensor, nn
 from transformers import (  # noqa: F401
    AddedToken,
    AutoConfig,
@@ -33,7 +35,9 @@ from transformers import (  # noqa: F401
    PreTrainedTokenizerBase,
 )
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
+from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub

+from axolotl.core.policies.auto_wrap import SUPPORTED_AUTO_WRAP_MODEL_TYPES
 from axolotl.models.mamba import fix_mamba_attn_for_loss
 from axolotl.monkeypatch.multipack import (
    SUPPORTED_MULTIPACK_MODEL_TYPES,
@@ -43,7 +47,6 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.chat_templates import chat_templates
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import zero_only
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers

 LOG = logging.getLogger("axolotl")
@@ -135,8 +138,9 @@ def load_tokenizer(cfg):
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

+    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
    tokenizer = tokenizer_cls.from_pretrained(
-        cfg.tokenizer_config,
+        tokenizer_config,
        trust_remote_code=cfg.trust_remote_code or False,
        use_fast=use_fast,
        **tokenizer_kwargs,
@@ -248,11 +252,10 @@ def load_tokenizer(cfg):
            {"additional_special_tokens": additional_special_tokens}
        )

-    with zero_only():
-        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-        LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if cfg.chat_template:
        chat_template_string = chat_templates(cfg.chat_template)
@@ -269,6 +272,117 @@ def load_tokenizer(cfg):
    return tokenizer


+def replace_linear(
+    model: nn.Module,
+    linear_replacement: Type[nn.Module],
+    quant_config: Union[dict, None] = None,
+    skip_modules=None,
+    **kwargs,
+):
+    """
+    Recursively swap every `torch.nn.Linear` child of `model` for `linear_replacement`.
+
+    Each replacement is built as
+    `linear_replacement(in_features, out_features, has_bias, **kwargs)`; nothing is
+    copied from the layer that is being replaced.
+
+    Parameters:
+        model (`torch.nn.Module`):
+            Input model or `torch.nn.Module` as the function is run recursively.
+        linear_replacement (`Type[torch.nn.Module]`):
+            The linear class that replaces the old one. Only `Linear4bit` and its
+            subclasses are accepted.
+        quant_config (`dict`, *optional*):
+            Not read by this function; it is only forwarded to the recursive calls.
+        skip_modules (`List[str]`, *optional*, defaults to `["lm_head"]`):
+            Names of direct children (not dotted paths) that must not be converted.
+        **kwargs:
+            Extra keyword arguments passed to the `linear_replacement` constructor.
+
+    Returns:
+        The same `model` object, modified in place.
+
+    Raises:
+        ValueError: if a convertible `nn.Linear` is found and `linear_replacement`
+            is not a subclass of `Linear4bit`.
+    """
+    if skip_modules is None:
+        skip_modules = ["lm_head"]
+    for name, module in model.named_children():
+        # Descend first so linear layers nested in containers are replaced as well.
+        if next(module.children(), None) is not None:
+            replace_linear(
+                module, linear_replacement, quant_config, skip_modules, **kwargs
+            )
+
+        if not isinstance(module, torch.nn.Linear) or name in skip_modules:
+            continue
+        if not issubclass(linear_replacement, Linear4bit):
+            # `linear_replacement` is already a class, so report it directly;
+            # `type(linear_replacement)` would only ever show its metaclass.
+            raise ValueError(f"Unsupported linear replacement: {linear_replacement}")
+        model._modules[name] = linear_replacement(  # pylint: disable=protected-access
+            module.in_features,
+            module.out_features,
+            module.bias is not None,
+            **kwargs,
+        )
+    return model
+
+
+def load_and_quantize(
+    module: nn.Module,
+    name: str,
+    value: Tensor,
+    device: torch.device = None,
+    dtype: torch.dtype = None,
+    skip_names: Optional[List[str]] = None,
+    is_meta_rank: bool = False,
+    low_memory: bool = True,
+    verbose: bool = False,
+    quant_method: str = "bnb",
+):
+    """
+    Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`.
+
+    Quantizes `Params4bit` on `device` then places on "cpu" if low_memory=True or "meta" if is_meta_rank=True.
+
+    Args:
+        module: root module; `name` is resolved relative to it.
+        name: dotted name of the parameter or buffer to set. It is split at the last
+            "." into the submodule path and the attribute name.
+        value: tensor to load.
+        device: device used for the `Params4bit` path only (passed to `.to(device=...)`
+            and `.cuda(...)`); other values are placed by `place_on_device` instead.
+        dtype: dtype the tensor is converted to before it is wrapped or placed.
+        skip_names: substrings; if any of them occurs in `name`, nothing is loaded.
+        is_meta_rank: put the final data on "meta"; checked before `low_memory`.
+        low_memory: put the final data on "cpu" (when `is_meta_rank` is False).
+        verbose: print a message when `name` is skipped via `skip_names`.
+        quant_method: only "bnb" triggers the parameter handling below; for any other
+            value `value` is assigned as passed in, without dtype/device placement.
+
+    Returns:
+        None. Returns early, without setting anything, when `name` is skipped or the
+        submodule path cannot be resolved (the latter always prints a message).
+    """
+
+    if skip_names is None:
+        skip_names = []
+
+    def place_on_device(value):
+        # Target device for everything that is not a Params4bit.
+        # NOTE(review): the local `device` here shadows the outer `device` argument, so
+        # the non-low-memory case goes to the default "cuda" device - confirm intended.
+        if is_meta_rank:
+            device = "meta"
+        elif low_memory:
+            device = "cpu"
+        else:
+            device = "cuda"
+        return value.to(device=device, dtype=dtype)
+
+    # Substring match: a skip name anywhere inside `name` suppresses the load.
+    if any(skip_name in name for skip_name in skip_names):
+        if verbose:
+            print(f"Skipping {name} because it is in skip_names")
+        return
+
+    # "a.b.weight" -> module_key "a.b", value_key "weight"
+    module_key, _, value_key = name.rpartition(".")
+    try:
+        submodule = module.get_submodule(module_key)
+    except AttributeError as exc:
+        print(f"Module {module_key} not found:\n{exc}")
+        return
+
+    try:
+        if quant_method == "bnb":
+            param = submodule.get_parameter(value_key)
+            if isinstance(param, Params4bit):
+                # With `sync_module_states=True`, a meta device Params4bit needs to be the same
+                # shape as the quantized Params4bit with an initialized quant_state. However,
+                # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This
+                # workaround quantizes Params4bit to initialize quant_state on all ranks, then
+                # replaces Params4bit's data with a meta tensor to free memory on non-rank 0.
+                # The existing parameter's attributes (`param.__dict__`) are reused as
+                # constructor keyword arguments for the new Params4bit.
+                # presumably the `.cuda(device)` call is what performs the bitsandbytes
+                # quantization - TODO confirm against the bitsandbytes Params4bit docs.
+                value = type(param)(
+                    value.to(device=device, dtype=dtype).data, **param.__dict__
+                ).cuda(device)
+                # Re-wrap the quantized data on its final device; if neither flag is set
+                # the value is left where `.cuda(device)` put it.
+                if is_meta_rank:
+                    value = type(param)(value.data.to("meta"), **value.__dict__)
+                elif low_memory:
+                    value = type(param)(value.data.to("cpu"), **value.__dict__)
+            else:
+                # Ordinary parameter: rebuild the same parameter class around the placed data.
+                value = type(param)(place_on_device(value).data)
+
+    except AttributeError:
+        # it's a buffer
+        # NOTE(review): this handler spans the whole block above, so an AttributeError
+        # raised while building a Params4bit would also land here - confirm acceptable.
+        value = place_on_device(value)
+
+    setattr(submodule, value_key, value)
+
+
 def load_model(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizerBase,
@@ -404,9 +518,7 @@ def load_model(
        from accelerate import infer_auto_device_map

        with init_empty_weights():
-            model_canvas = AutoModelForCausalLM.from_config(
-                model_config, trust_remote_code=cfg.trust_remote_code or False
-            )
+            model_canvas = AutoModelForCausalLM.from_config(model_config)
        model_canvas.tie_weights()
        device_map = infer_auto_device_map(
            model_canvas,
@@ -437,7 +549,6 @@ def load_model(

    if cfg.revision_of_model:
        model_kwargs["revision"] = cfg.revision_of_model
-
    if cfg.gptq:
        if not hasattr(model_config, "quantization_config"):
            LOG.warning("model config does not contain quantization_config information")
@@ -457,12 +568,7 @@ def load_model(
            "bnb_4bit_compute_dtype": cfg.torch_dtype,
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_quant_type": "nf4",
-            "bnb_4bit_quant_storage": torch.bfloat16,
        }
-        if not cfg.deepspeed:
-            # for some reason, this causes the loss to be off by an order of magnitude
-            # but deepspeed needs this still in bfloat16
-            bnb_config["bnb_4bit_quant_storage"] = torch.float32

        if cfg.bnb_config_kwargs:
            bnb_config.update(cfg.bnb_config_kwargs)
@@ -511,13 +617,78 @@ def load_model(
        model_kwargs["attn_implementation"] = "eager"
        model_config._attn_implementation = "eager"  # pylint: disable=protected-access

-    if cfg.low_cpu_mem_usage:
-        model_kwargs["low_cpu_mem_usage"] = True
-
-    qlora_fsdp = cfg.fsdp and cfg.adapter == "qlora"
+    qlora_fsdp = (
+        cfg.fsdp
+        and cfg.adapter == "qlora"
+        and model_config.model_type in SUPPORTED_AUTO_WRAP_MODEL_TYPES
+    )

    try:
-        if (
+        if qlora_fsdp:
+            if cfg.bf16 or cfg.bfloat16:
+                torch_dtype, compute_dtype = torch.float32, torch.bfloat16
+            elif cfg.fp16 or cfg.float16:
+                torch_dtype, compute_dtype = torch.float32, torch.float16
+            else:
+                torch_dtype, compute_dtype = torch.float32, torch.float16
+
+            with init_empty_weights():
+                LOG.info("Loading model with empty weights.")
+                model = AutoModelForCausalLM.from_config(model_config)
+                model.model = replace_linear(
+                    model.model,
+                    Linear4bit,
+                    compute_dtype=compute_dtype,
+                    quant_type="nf4",
+                    quant_storage=torch_dtype,
+                )
+
+            model.is_loaded_in_4bit = True
+
+            # Grab the safetensors files that hold the weights
+            try:
+                idx = hub.cached_file(base_model, SAFE_WEIGHTS_INDEX_NAME)
+                files, _ = hub.get_checkpoint_shard_files(base_model, idx)
+            except OSError:
+                try:
+                    # This means the model doesn't have a model.safetensors.index.json because it is not sharded
+                    files = []
+                    files.append(hub.cached_file(base_model, SAFE_WEIGHTS_NAME))
+                except OSError as exc:
+                    # This means the model probably doesn't have a safetensors file
+                    raise exc
+
+            # Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly
+            # and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage
+            def load_and_quantize_parallel(name_param, model, **kwargs):
+                name, param = name_param
+                load_and_quantize(model, name, param, **kwargs)
+
+            param_count = sum((p.numel() for n, p in model.named_parameters()))
+            for filename in files:
+                weights = safetensors.torch.load_file(filename)
+                quant_method = "bnb"
+                devprops = torch.cuda.get_device_properties(torch.cuda.current_device())
+                left = int(os.cpu_count() / torch.cuda.device_count())
+                right = int(
+                    8 * (devprops.total_memory / 1e9 / 40) * (70 / (param_count / 1e9))
+                )
+                n_workers = min(left, right)
+                parallel(
+                    load_and_quantize_parallel,
+                    weights.items(),
+                    n_workers=n_workers,
+                    threadpool=True,
+                    model=model,
+                    dtype=torch_dtype,
+                    device=cfg.local_rank,
+                    skip_names=[],
+                    is_meta_rank=(cfg.local_rank != 0),
+                    verbose=False,
+                    quant_method=quant_method,
+                )
+
+        elif (
            model_config.model_type == "llama"
            and not cfg.trust_remote_code
            and not cfg.gptq
@@ -544,6 +715,27 @@ def load_model(
                if cfg.flash_attn_fuse_qkv:
                    LOG.info("patching with fused QKV")
                    replace_llama_qkv_with_fused(model)
+        elif (
+            model_config.model_type == "mixtral"
+            and not cfg.adapter
+            and cfg.fuse_moe
+        ):
+            from axolotl.monkeypatch.utils import set_module_name
+            from axolotl.monkeypatch.moe.moe import SparseMoeBlock
+            from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+            for name, module in model.named_modules():
+                if isinstance(module, MixtralSparseMoeBlock):
+                    smoe = SparseMoeBlock(
+                        experts=module.experts,
+                        gate=module.gate,
+                        hidden_dim=module.hidden_dim,
+                        ffn_dim=module.ffn_dim,
+                        num_experts=module.num_experts,
+                        top_k=module.top_k,
+                    )
+                    set_module_name(model, name, smoe)
+
        elif model_type == "MambaLMHeadModel":
            # FIXME this is janky at best and hacked together to make it work
            MambaLMHeadModel = fix_mamba_attn_for_loss()  # pylint: disable=invalid-name
@@ -691,9 +883,7 @@ def load_model(

    if cfg.adapter in ["lora", "qlora"]:
        if cfg.gradient_checkpointing:
-            model.gradient_checkpointing_enable(
-                gradient_checkpointing_kwargs=cfg.gradient_checkpointing_kwargs
-            )
+            model.gradient_checkpointing_enable()
        if (
            cfg.load_in_8bit or cfg.load_in_4bit
        ) and not skip_prepare_model_for_kbit_training:
@@ -861,9 +1051,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
    if cfg.peft_use_dora:
        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
    if cfg.peft_use_rslora:
-        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
-    if cfg.peft_layer_replication:
-        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
+        lora_config_kwargs["use_rslora"] = cfg.use_rslora

    lora_config = LoraConfig(
        r=cfg.lora_r,
@@ -902,12 +1090,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
        model = get_peft_model(model, lora_config)

    if rank == 0:
-        try:
-            model.print_trainable_parameters()
-        except AttributeError as exc:
-            LOG.warning(
-                "Exception caught during model.print_trainable_parameters(): %s", exc
-            )
+        model.print_trainable_parameters()
    elif cfg.fsdp and cfg.adapter == "qlora":
        setup_quantized_peft_meta_for_training(model)

--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -11,7 +11,6 @@ import torch.cuda
 from accelerate.logging import get_logger
 from datasets import set_caching_enabled
 from torch.utils.data import DataLoader, RandomSampler
-from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFDPOTrainerBuilder
 from axolotl.utils.distributed import is_main_process, reduce_and_broadcast, zero_first
@@ -125,10 +124,9 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
                eval_dataset = eval_dataset.remove_columns("attention_mask")

        if cfg.model_config_type == "falcon":
-            LOG.info("dropping token_type_ids column if it exists")
-            if "token_type_ids" in train_dataset.column_names:
-                train_dataset = train_dataset.remove_columns("token_type_ids")
-            if eval_dataset and "token_type_ids" in eval_dataset.column_names:
+            LOG.info("dropping token_type_ids column")
+            train_dataset = train_dataset.remove_columns("token_type_ids")
+            if eval_dataset:
                eval_dataset = eval_dataset.remove_columns("token_type_ids")

        train_dataset = train_dataset.filter(
@@ -172,21 +170,17 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
    return train_dataset, eval_dataset


-def process_pretraining_datasets_for_packing(
-    train_dataset, sequence_len, skip_position_ids=True
-):
+def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
    drop_long = partial(drop_long_seq, sequence_len=sequence_len)

    train_dataset = train_dataset.filter(
        drop_long,
        desc="Dropping Long Sequences",
    )
-    if skip_position_ids:
-        train_dataset = train_dataset.map(
-            add_position_ids,
-            desc="Add position_id column (Pretraining Sample Packing)",
-        )
-
+    train_dataset = train_dataset.map(
+        add_position_ids,
+        desc="Add position_id column (Pretraining Sample Packing)",
+    )
    return train_dataset


@@ -198,7 +192,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
            .values
        )
-        LOG.debug(f"total_num_tokens: {total_num_tokens:_}", main_process_only=True)
+        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
        if update:
            cfg.total_num_tokens = total_num_tokens

@@ -212,7 +206,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            .sum()
        )
        LOG.debug(
-            f"`total_supervised_tokens: {total_supervised_tokens:_}`",
+            f"`total_supervised_tokens: {total_supervised_tokens}`",
            main_process_only=True,
        )
        if update:
@@ -239,7 +233,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                * cfg.num_epochs
            )
            LOG.debug(
-                f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}",
+                f"total_num_tokens: {cfg.total_num_tokens}, total_num_steps: {total_num_steps}",
                main_process_only=True,
            )
        else:
@@ -310,14 +304,8 @@ def setup_fsdp_envs(cfg):
        os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
    if cfg.fsdp_config.fsdp_sync_module_states:
        os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
-    if cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
-        os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true"
-    if cfg.fsdp_config.fsdp_use_orig_params:
-        os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
    if cfg.fsdp_config.fsdp_state_dict_type:
        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
-    if cfg.fsdp_config.fsdp_auto_wrap_policy:
-        os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.fsdp_auto_wrap_policy
    if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
        os.environ[
            "FSDP_TRANSFORMER_CLS_TO_WRAP"
@@ -331,11 +319,6 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
        os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed

-    if (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
-        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
-    elif cfg.fp16:
-        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
-

 def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    if cfg.rl in ["dpo", "ipo", "kto_pair"]:
--- a/styles.css
+++ b/styles.css
@@ -1 +0,0 @@
-/* css styles */
--- a/tests/core/test_trainer_builder.py
+++ b/tests/core/test_trainer_builder.py
@@ -1,18 +1,16 @@
 """
 unit tests for axolotl.core.trainer_builder
 """
-
 import pytest

 from axolotl.core.trainer_builder import HFDPOTrainerBuilder
-from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer


@pytest.fixture(name="cfg")
 def fixture_cfg():
-    cfg = DictDefault(
+    return DictDefault(
        {
            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
            "model_type": "AutoModelForCausalLM",
@@ -36,10 +34,6 @@ def fixture_cfg():
        }
    )

-    normalize_config(cfg)
-
-    return cfg
-

@pytest.fixture(name="tokenizer")
 def fixture_tokenizer(cfg):
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -77,7 +77,7 @@ class TestMixtral(unittest.TestCase):
        model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
-            == torch.float32
+            == torch.uint8
        )
        assert (Path(temp_dir) / "adapter_model.bin").exists()

@@ -131,7 +131,7 @@ class TestMixtral(unittest.TestCase):
        model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
-            == torch.float32
+            == torch.uint8
        )
        assert (Path(temp_dir) / "adapter_model.bin").exists()

--- a/tests/monkeypatch/test_moe.py
+++ b/tests/monkeypatch/test_moe.py
@@ -0,0 +1,60 @@
+import torch
+import pytest
+from torch import nn
+from torch.nn import functional as F
+from axolotl.monkeypatch.moe.mlp import FusedExperts
+from axolotl.monkeypatch.moe.moe import SparseMoeBlock
+
+from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock, MixtralConfig
+
+def test_fused_mixtral_moe():
+    """Fused SparseMoeBlock must reproduce MixtralSparseMoeBlock outputs (NOTE: requires torch 2.2.0)."""
+    # Default to fp16 tensors on the CUDA device and seed the RNG for reproducibility
+    torch.set_default_dtype(torch.float16)
+    torch.set_default_device("cuda")
+    torch.manual_seed(0)
+
+    # Define the configuration for the MixtralSparseMoeBlock
+    config = MixtralConfig(
+        hidden_size=128,
+        intermediate_size=512,
+        num_local_experts=8,
+        num_experts_per_tok=2,
+    )
+
+    # Build the reference block, then the fused block from the reference experts and gate
+    mixtral_moe = MixtralSparseMoeBlock(config)
+    sparse_moe = SparseMoeBlock(
+        experts=mixtral_moe.experts,
+        gate=mixtral_moe.gate,
+        hidden_dim=config.hidden_size,
+        ffn_dim=config.intermediate_size,
+        num_experts=config.num_local_experts,
+        top_k=config.num_experts_per_tok
+    )
+
+    assert torch.cat([
+        mixtral_moe.experts[0].w1.weight.data,
+        mixtral_moe.experts[0].w3.weight.data], dim=0
+    ).equal(sparse_moe.experts.experts.weight[0])
+
+    # Generate random input data
+    batch_size = 16
+    sequence_length = 32
+    input_data = torch.randn(batch_size, sequence_length, config.hidden_size)
+
+    # Run the forward pass without gradients for both models
+    with torch.no_grad():
+        mixtral_output, mixtral_router_logits = mixtral_moe(input_data)
+        sparse_output, sparse_router_logits = sparse_moe(input_data)
+
+    # Compute the mean absolute difference between the outputs and between the router logits
+    output_diff = torch.abs(mixtral_output - sparse_output).mean().item()
+    router_diff = torch.abs(mixtral_router_logits - sparse_router_logits).mean().item()
+
+    # Define the tolerance for the output difference (router logits must match exactly)
+    tolerance = 0.05
+
+    # Check if the differences are within the allowed bounds
+    assert output_diff < tolerance, f"Output difference is {output_diff}, which is greater than the tolerance of {tolerance}"
+    assert router_diff == 0, f"Router logits difference is {router_diff}, but the router logits must match exactly"
--- a/tests/prompt_strategies/test_sharegpt.py
+++ b/tests/prompt_strategies/test_sharegpt.py
@@ -62,38 +62,6 @@ def fixture_sharegpt_glaive_dataset():
    )


-@pytest.fixture(name="multi_role_dataset")
-def fixture_multi_role_dataset():
-    return Dataset.from_list(
-        [
-            {
-                "conversations": [
-                    {
-                        "from": "system",
-                        "value": "use get_weather(city) to get the weather for a city",
-                    },
-                    {
-                        "from": "human",
-                        "value": "hello, what's the weather in New York?",
-                    },
-                    {
-                        "from": "gpt",
-                        "value": "let me get that for you",
-                    },
-                    {
-                        "from": "tool",
-                        "value": "get_weather(New York)",
-                    },
-                    {
-                        "from": "gpt",
-                        "value": "the weather in New York is 70 degrees and sunny",
-                    },
-                ]
-            }
-        ]
-    )
-
-
@pytest.fixture(name="tokenizer")
 def fixture_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
@@ -228,39 +196,3 @@ class TestSharegpt:
            32001, 13892, 13, 28737, 28742, 28719, 7371, 28725, 562, 315, 949, 28742, 28707, 506, 272, 21368, 298, 1820, 22447, 28723, 28705, 523, 28766, 416, 1009, 772, 28766, 28767, 32000, 28705, 13  # gpt
        ]
        # fmt: on
-
-    def test_multi_role_dataset(self, multi_role_dataset, tokenizer):
-        strategy = SimpleShareGPTPromptTokenizingStrategy(
-            ShareGPTPrompterV2(conversation="chatml", roles={"input": ["tool"]}),
-            tokenizer,
-            False,  # train_on_inputs
-            2048,  # sequence_len
-        )
-
-        dataset_wrapper = TokenizedPromptDataset(
-            strategy, multi_role_dataset, process_count=1
-        )
-
-        input_ids = dataset_wrapper[0]["input_ids"]
-        # fmt: off
-        assert input_ids == [
-            1,   # bos
-            32001, 1587, 13, 1730, 625, 28730, 769, 1223, 28732, 18373, 28731, 298, 625, 272, 8086, 354, 264, 2990, 32000, 28705, 13,  # system
-            32001, 2188, 13, 21558, 28725, 767, 28742, 28713, 272, 8086, 297, 1450, 2726, 28804, 32000, 28705, 13,  # human
-            32001, 13892, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13,  # gpt
-            32001, 3921, 13, 527, 28730, 769, 1223, 28732, 2972, 2726, 28731, 32000, 28705, 13,  # tool
-            32001, 13892, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13  # gpt
-        ]
-        # fmt: on
-
-        labels = dataset_wrapper[0]["labels"]
-        # fmt: off
-        assert labels == [
-            -100,  # bos
-            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # system
-            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # human
-            -100, -100, 13, 895, 528, 625, 369, 354, 368, 32000, 28705, 13,  # gpt
-            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # tool
-            -100, -100, 13, 1237, 8086, 297, 1450, 2726, 349, 28705, 28787, 28734, 11182, 304, 4376, 1780, 32000, 28705, 13  # gpt
-        ]
-        # fmt: on
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,272 +0,0 @@
-"""
-Test dataset loading under various conditions.
-"""
-
-import shutil
-import tempfile
-import unittest
-from pathlib import Path
-
-from datasets import Dataset
-from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-from axolotl.utils.data import load_tokenized_prepared_datasets
-from axolotl.utils.dict import DictDefault
-
-
-class TestDatasetPreparation(unittest.TestCase):
-    """Test a configured dataloader."""
-
-    def setUp(self) -> None:
-        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
-        self.tokenizer.add_special_tokens(
-            {
-                "bos_token": "<s>",
-                "eos_token": "</s>",
-                "unk_token": "<unk>",
-            }
-        )
-        # Alpaca dataset.
-        self.dataset = Dataset.from_list(
-            [
-                {
-                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
-                    "input": "He finnished his meal and left the resturant",
-                    "output": "He finished his meal and left the restaurant.",
-                }
-            ]
-        )
-
-    def test_load_hub(self):
-        """Core use case.  Verify that processing data from the hub works"""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": "mhenrichsen/alpaca_2k_test",
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-    def test_load_local_hub(self):
-        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
-                repo_id="mhenrichsen/alpaca_2k_test",
-                repo_type="dataset",
-                local_dir=tmp_ds_path,
-            )
-
-            prepared_path = Path(tmp_dir) / "prepared"
-            # Right now a local copy that doesn't fully conform to a dataset
-            # must list data_files and ds_type otherwise the loader won't know
-            # how to load it.
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": "mhenrichsen/alpaca_2k_test",
-                            "ds_type": "parquet",
-                            "type": "alpaca",
-                            "data_files": [
-                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
-                            ],
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
-
-    def test_load_from_save_to_disk(self):
-        """Usual use case.  Verify datasets saved via `save_to_disk` can be loaded."""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
-            self.dataset.save_to_disk(tmp_ds_name)
-
-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 256,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_name),
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 1
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-    def test_load_from_dir_of_parquet(self):
-        """Usual use case.  Verify a directory of parquet files can be loaded."""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
-            tmp_ds_dir.mkdir()
-            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
-            self.dataset.to_parquet(tmp_ds_path)
-
-            prepared_path: Path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 256,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_dir),
-                            "ds_type": "parquet",
-                            "name": "test_data",
-                            "data_files": [
-                                str(tmp_ds_path),
-                            ],
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 1
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-    def test_load_from_dir_of_json(self):
-        """Standard use case.  Verify a directory of json files can be loaded."""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
-            tmp_ds_dir.mkdir()
-            tmp_ds_path = tmp_ds_dir / "shard1.json"
-            self.dataset.to_json(tmp_ds_path)
-
-            prepared_path: Path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 256,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_dir),
-                            "ds_type": "json",
-                            "name": "test_data",
-                            "data_files": [
-                                str(tmp_ds_path),
-                            ],
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 1
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-    def test_load_from_single_parquet(self):
-        """Standard use case.  Verify a single parquet file can be loaded."""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
-            self.dataset.to_parquet(tmp_ds_path)
-
-            prepared_path: Path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 256,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_path),
-                            "name": "test_data",
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 1
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-    def test_load_from_single_json(self):
-        """Standard use case.  Verify a single json file can be loaded."""
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
-            self.dataset.to_json(tmp_ds_path)
-
-            prepared_path: Path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 256,
-                    "datasets": [
-                        {
-                            "path": str(tmp_ds_path),
-                            "name": "test_data",
-                            "type": "alpaca",
-                        },
-                    ],
-                }
-            )
-
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
-
-            assert len(dataset) == 1
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -8,8 +8,7 @@ from pathlib import Path
 from typing import Optional

 import pytest
-from datasets import load_dataset
-from transformers import AddedToken, AutoTokenizer, LlamaTokenizer
+from transformers import AutoTokenizer, LlamaTokenizer

 from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
 from axolotl.prompt_strategies.alpaca_w_system import (
@@ -20,14 +19,12 @@ from axolotl.prompt_strategies.llama2_chat import (
    Llama2ChatPrompter,
    LLama2ChatTokenizingStrategy,
 )
-from axolotl.prompt_strategies.orpo.chat_template import load
 from axolotl.prompt_strategies.sharegpt import GlaiveShareGPTPromptTokenizingStrategy
 from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    ShareGPTPromptTokenizingStrategy,
 )
 from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompterV2
-from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger("axolotl")

@@ -449,57 +446,5 @@ If a question does not make any sense, or is not factually coherent, explain why
        )


-class OrpoTokenizationTest(unittest.TestCase):
-    """test case for the ORPO tokenization"""
-
-    def setUp(self) -> None:
-        # pylint: disable=duplicate-code
-        tokenizer = LlamaTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
-        tokenizer.add_special_tokens(
-            {
-                "eos_token": AddedToken(
-                    "<|im_end|>", rstrip=False, lstrip=False, normalized=False
-                )
-            }
-        )
-        tokenizer.add_tokens(
-            [
-                AddedToken(
-                    "<|im_start|>", rstrip=False, lstrip=False, normalized=False
-                ),
-            ]
-        )
-        self.tokenizer = tokenizer
-        self.dataset = load_dataset(
-            "argilla/ultrafeedback-binarized-preferences-cleaned", split="train"
-        ).select([0])
-
-    def test_orpo_integration(self):
-        strat = load(
-            self.tokenizer,
-            DictDefault({"train_on_inputs": False}),
-            DictDefault({"chat_template": "chatml"}),
-        )
-        res = strat.tokenize_prompt(self.dataset[0])
-        assert "rejected_input_ids" in res
-        assert "rejected_labels" in res
-        assert "input_ids" in res
-        assert "labels" in res
-        assert "prompt_attention_mask" in res
-
-        assert len(res["rejected_input_ids"]) == len(res["rejected_labels"])
-        assert len(res["input_ids"]) == len(res["labels"])
-        assert len(res["input_ids"]) == len(res["prompt_attention_mask"])
-
-        assert res["rejected_labels"][0] == -100
-        assert res["rejected_input_ids"][-1] == res["rejected_labels"][-1]
-
-        assert res["labels"][0] == -100
-        assert res["input_ids"][-1] == res["labels"][-1]
-
-        assert res["prompt_attention_mask"][0] == 1
-        assert res["prompt_attention_mask"][-1] == 0
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Casper Hansen	10328b3429	Simplify creating parameters	2024-03-18 12:32:59 +00:00
Casper Hansen	5bfc470d57	Stop transformers from using all memory	2024-03-18 11:47:47 +00:00
Casper Hansen	04168801c9	Simplify conversion + more debug	2024-03-17 20:21:46 +00:00
Casper	d43a79b7bf	device_map auto	2024-03-17 19:52:56 +01:00
Casper	884d81331e	Initialize ParallelExperts on device of first expert	2024-03-17 19:51:31 +01:00
Casper	2ea75b4160	temporary: inference validation script	2024-03-17 19:48:52 +01:00
Casper Hansen	035e680631	Update test	2024-03-15 13:58:12 +00:00
Casper Hansen	26fc10df01	Refactor names, bugfixes	2024-03-15 12:39:11 +00:00
Casper Hansen	1bc008e901	Refactor creating FusedExperts	2024-03-15 11:59:56 +00:00
Casper Hansen	3f7ed6a784	Bugfixes, test green	2024-03-15 11:48:46 +00:00
Casper	feea977923	initial implementation, untested	2024-03-15 11:54:36 +01:00