wip load remote data from postgres

2024-02-12 09:55:24 -05:00
121 changed files with 817 additions and 5972 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yaml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yaml
@@ -59,7 +59,6 @@ body:
      label: Config yaml
      description: |
        Please attach the config yaml!
-      render: yaml

  - type: textarea
    id: possible-solution
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -12,6 +12,11 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: "118"
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -17,6 +17,6 @@ jobs:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.9"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.0
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -13,12 +13,16 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
+            axolotl_extras:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.1.2
            axolotl_extras:
-            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
            is_latest: true
          - cuda: 121
            cuda_version: 12.1.0
@@ -55,7 +59,6 @@ jobs:
            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -70,6 +73,11 @@ jobs:
    strategy:
      matrix:
        include:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
+            axolotl_extras:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -23,7 +23,7 @@ jobs:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
        with:
-          python-version: "3.10"
+          python-version: "3.9"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.0

@@ -33,7 +33,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.10", "3.11"]
+        python_version: ["3.9", "3.10", "3.11"]
    timeout-minutes: 10

    steps:
@@ -58,8 +58,8 @@ jobs:
  docker-e2e-tests:
    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 60
+    runs-on: [self-hosted, gpu, docker]
+    timeout-minutes: 30
    needs: [pre-commit, pytest]

    strategy:
@@ -69,32 +69,44 @@ jobs:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.1.2
-            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
-            num_gpus: 1
+            pytorch: 2.0.1
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
-            num_gpus: 1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
        with:
-          python-version: "3.10"
-      - name: Install Modal
+          images: winglian/axolotl-tests
+      - name: Build Docker image
        run: |
-          python -m pip install --upgrade pip
-          pip install modal jinja2
-      - name: Update env vars
+          # Set up build arguments
+          BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
+          CUDA="${{ matrix.cuda }}"
+          PYTORCH_VERSION="${{ matrix.pytorch }}"
+          # Build the Docker image
+          docker build . \
+            --file ./docker/Dockerfile-tests \
+            --build-arg BASE_TAG=$BASE_TAG \
+            --build-arg CUDA=$CUDA \
+            --build-arg GITHUB_REF=$GITHUB_REF \
+            --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
+            --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
+            --no-cache
+      - name: Unit Tests w docker image
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
+          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+      - name: GPU Unit Tests w docker image
        run: |
-          modal run cicd.tests
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
+      - name: GPU Unit Tests monkeypatched w docker image
+        run: |
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
+      - name: Prune image from docker
+        if: github.ref != 'refs/heads/main'
+        run: |
+          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
--- a/.gitignore
+++ b/.gitignore
@@ -167,8 +167,3 @@ cython_debug/
 # WandB
 # wandb creates a folder to store logs for training runs
 wandb
-
-# Runs
-lora-out/*
-qlora-out/*
-mlruns/*
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-plugins = pydantic.mypy
+
 exclude = venv

 [mypy-alpaca_lora_4bit.*]
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,7 +31,6 @@ repos:
      additional_dependencies:
        [
            'types-PyYAML',
-            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
    rev: 1.7.5
--- a/README.md
+++ b/README.md
@@ -13,9 +13,6 @@ Features:
 - Log results and optionally checkpoints to wandb or mlflow
 - And more!

-<a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
-  <img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWEx
LjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
-</a>

 <table>
 <tr>
@@ -25,20 +22,19 @@ Features:
 - [Introduction](#axolotl)
 - [Supported Features](#axolotl-supports)
 - [Quickstart](#quickstart-)
- [Environment](#environment)
+- [Installation](#installation)
  - [Docker](#docker)
  - [Conda/Pip venv](#condapip-venv)
-  - [Cloud GPU](#cloud-gpu) - Latitude.sh, JarvisLabs, RunPod
+  - [Cloud GPU](#cloud-gpu) - Latitude.sh, RunPod
  - [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
  - [Windows](#windows)
-  - [Mac](#mac)
  - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
 - [Dataset](#dataset)
  - [How to Add Custom Prompts](#how-to-add-custom-prompts)
  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
  - [Train](#train)
-  - [Inference](#inference-playground)
+  - [Inference](#inference)
  - [Merge LORA to Base](#merge-lora-to-base)
  - [Special Tokens](#special-tokens)
 - Advanced Topics
@@ -91,18 +87,17 @@ Features:
 | phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
 | RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
 | Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
-| Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |

-✅: supported
-❌: not supported
-❓: untested

 ## Quickstart ⚡

 Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.

-**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
+**Requirements**: Python >=3.9 and Pytorch >=2.0.

+`pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
+
+### For developers
 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
@@ -132,14 +127,13 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
 ```

-## Advanced Setup
+## Installation

 ### Environment

 #### Docker
-
  ```bash
-  docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
+  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
  ```

  Or run on the current files for development:
@@ -158,7 +152,7 @@ accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAcc
  A more powerful Docker command to run would be this:

  ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
  ```

  It additionally:
@@ -173,7 +167,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
  </details>

 #### Conda/Pip venv
-  1. Install python >=**3.10**
+  1. Install python >=**3.9**

  2. Install pytorch stable https://pytorch.org/get-started/locally/

@@ -193,7 +187,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
 For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)

 - on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
 - on RunPod use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)

 #### Bare Metal Cloud GPU
@@ -207,11 +200,11 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
  1. Install python
  ```bash
  sudo apt update
-  sudo apt install -y python3.10
+  sudo apt install -y python3.9

-  sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
-  sudo update-alternatives --config python # pick 3.10 if given option
-  python -V # should be 3.10
+  sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
+  sudo update-alternatives --config python # pick 3.9 if given option
+  python -V # should be 3.9

  ```

@@ -243,46 +236,21 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
  ```
  </details>

-##### GCP
-
-<details>
-
-<summary>Click to Expand</summary>
-
-Use a Deeplearning linux OS with cuda and pytorch installed. Then follow instructions on quickstart.
-
-Make sure to run the below to uninstall xla.
-```bash
-pip uninstall -y torch_xla[tpu]
-```
-
-</details>
-
 #### Windows
 Please use WSL or Docker!

-#### Mac
-
-Use the below instead of the install method in QuickStart.
-```
-pip3 install -e '.'
-```
-More info: [mac.md](/docs/mac.md)

 #### Launching on public clouds via SkyPilot
 To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
-
 ```bash
 pip install "skypilot-nightly[gcp,aws,azure,oci,lambda,kubernetes,ibm,scp]"  # choose your clouds
 sky check
 ```
-
 Get the [example YAMLs](https://github.com/skypilot-org/skypilot/tree/master/llm/axolotl) of using Axolotl to finetune `mistralai/Mistral-7B-v0.1`:
 ```
 git clone https://github.com/skypilot-org/skypilot.git
 cd skypilot/llm/axolotl
 ```
-
 Use one command to launch:
 ```bash
 # On-demand
@@ -292,32 +260,31 @@ HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
 HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
 ```

+
 ### Dataset

 Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
 Have dataset(s) in one of the following format (JSONL recommended):

-#### Pretraining
-
- `completion`: raw corpus
-  ```json
-  {"text": "..."}
-  ```
-
-Note: Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
-
-```yaml
-pretraining_dataset: # hf path only
-```
-
-#### Supervised finetuning
-
-##### Instruction
-
 - `alpaca`: instruction; input(optional)
  ```json
  {"instruction": "...", "input": "...", "output": "..."}
  ```
+- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: `system` to override default system prompt)
+  ```json
+  {"conversations": [{"from": "...", "value": "..."}]}
+  ```
+- `llama-2`: the json is the same format as `sharegpt` above, with the following config (see the [config section](#config) for more details)
+    ```yml
+    datasets:
+      - path: <your-path>
+        type: sharegpt
+        conversation: llama-2
+    ```
+- `completion`: raw corpus
+  ```json
+  {"text": "..."}
+  ```

 <details>

@@ -395,37 +362,14 @@ pretraining_dataset: # hf path only
  ```json
  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
  ```
- `metharme`: instruction, adds additional eos tokens
-  ```json
-  {"prompt": "...", "generation": "..."}
-  ```
-
-</details>
-
-##### Template-Free
-
- `input_output`: template-free prompt construction
-  ```json
-   {"segments": [{"label": true|false, "text": "..."}]}
-  ```
-
-This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction.  See [these docs](docs/input_output.md) for more details.
-
-##### Conversation
-
- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
-  ```json
-  {"conversations": [{"from": "...", "value": "..."}]}
-  ```
-
-<details>
-
-<summary>See other formats</summary>
-
 - `pygmalion`: pygmalion
  ```json
  {"conversations": [{"role": "...", "value": "..."}]}
  ```
+- `metharme`: instruction, adds additional eos tokens
+  ```json
+  {"prompt": "...", "generation": "..."}
+  ```
 - `sharegpt.load_role`: conversations where `role` is used instead of `from`
  ```json
  {"conversations": [{"role": "...", "value": "..."}]}
@@ -441,8 +385,6 @@ This is a special format that allows you to construct prompts without using temp

 </details>

-Note: `type: sharegpt` opens a special config `conversation:` that enables conversions to many Conversation types. See dataset section under [all yaml options](#all-yaml-options).
-
 #### How to add custom prompts

 For a dataset that is preprocessed for instruction purposes:
@@ -464,16 +406,12 @@ datasets:
      format: "[INST] {instruction} [/INST]"
      no_input_format: "[INST] {instruction} [/INST]"
 ```
-See full config options under [all yaml options](#all-yaml-options).

 #### How to use your custom pretokenized dataset

 - Do not pass a `type:`
 - Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`

-```yaml
- path: ...
-```

 ### Config

@@ -487,18 +425,22 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod

 - dataset
  ```yaml
-  datasets:
-      # huggingface repo
-    - path: vicgalle/alpaca-gpt4
-      type: alpaca
+  sequence_len: 2048 # max token length for prompt

-      # huggingface repo with specific configuration/subset
+  # huggingface repo
+  datasets:
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # huggingface repo with specific configuration/subset
+  datasets:
    - path: EleutherAI/pile
      name: enron_emails
      type: completion # format from earlier
      field: text # Optional[str] default: text, field to use for completion data

-      # huggingface repo with multiple named configurations/subsets
+  # huggingface repo with multiple named configurations/subsets
+  datasets:
    - path: bigcode/commitpackft
      name:
        - ruby
@@ -506,29 +448,34 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
        - typescript
      type: ... # unimplemented custom format

-      # fastchat conversation
-      # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+  # fastchat conversation
+  # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+  datasets:
    - path: ...
      type: sharegpt
-      conversation: chatml # default: vicuna_v1.1
+      conversation: chatml

-      # local
+  # local
+  datasets:
    - path: data.jsonl # or json
      ds_type: json # see other options below
      type: alpaca

-      # dataset with splits, but no train split
+  # dataset with splits, but no train split
+  datasets:
    - path: knowrohit07/know_sql
      type: context_qa.load_v2
      train_on_split: validation

-      # loading from s3 or gcs
-      # s3 creds will be loaded from the system default and gcs only supports public access
+  # loading from s3 or gcs
+  # s3 creds will be loaded from the system default and gcs only supports public access
+  datasets:
    - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
      ...

-      # Loading Data From a Public URL
-      # - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
+  # Loading Data From a Public URL
+  # - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
+  datasets:
    - path: https://some.url.com/yourdata.jsonl # The URL should be a direct link to the file you wish to load. URLs must use HTTPS protocol, not HTTP.
      ds_type: json # this is the default, see other options below.
  ```
@@ -537,11 +484,9 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
  ```yaml
  load_in_4bit: true
  load_in_8bit: true
-
  bf16: auto # require >=ampere, auto will detect if your GPU supports this and choose automatically.
  fp16: # leave empty to use fp16 when bf16 is 'auto'. set to false if you want to fallback to fp32
  tf32: true # require >=ampere
-
  bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
  float16: true # use instead of fp16 when you don't want AMP
  ```
@@ -549,7 +494,7 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod

 - lora
  ```yaml
-  adapter: lora # 'qlora' or leave blank for full finetune
+  adapter: lora # qlora or leave blank for full finetune
  lora_r: 8
  lora_alpha: 16
  lora_dropout: 0.05
@@ -558,9 +503,9 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
    - v_proj
  ```

-<details id="all-yaml-options">
+<details>

-<summary>All yaml options (click to expand)</summary>
+<summary>All yaml options (click me)</summary>

 ```yaml
 # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
@@ -572,8 +517,8 @@ base_model_ignore_patterns:
 # You can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
 # You can specify to choose a specific model revision from huggingface hub
-revision_of_model:
-# Optional tokenizer configuration path in case you want to use a different tokenizer
+model_revision:
+# Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
 # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
@@ -590,16 +535,15 @@ tokenizer_legacy:
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:

-# (Internal use only)
 # Used to identify which the model is based on
 is_falcon_derived_model:
 is_llama_derived_model:
-is_qwen_derived_model:
 # Please note that if you set this to true, `padding_side` will be set to "left" by default
 is_mistral_derived_model:
+is_qwen_derived_model:

 # optional overrides to the base model configuration
-overrides_of_model_config:
+model_config:
  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
  rope_scaling:
    type: # linear | dynamic
@@ -616,6 +560,8 @@ bnb_config_kwargs:

 # Whether you are training a 4-bit GPTQ quantized model
 gptq: true
+gptq_groupsize: 128 # group size
+gptq_model_v1: false # v1 or v2

 # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
 load_in_8bit: true
@@ -651,13 +597,9 @@ datasets:
    train_on_split: train # Optional[str] name of dataset split to load from

    # Optional[str] fastchat conversation type, only used with type: sharegpt
-    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
    field_human: # Optional[str]. Human key to use for conversation.
    field_model: # Optional[str]. Assistant key to use for conversation.
-    # Add additional keys from your dataset as input or output roles
-    roles:
-      input: # Optional[List[str]]. These will be masked based on train_on_input
-      output: # Optional[List[str]].

  # Custom user instruction prompt
  - path: repo
@@ -682,10 +624,6 @@ datasets:
      # For `completion` datsets only, uses the provided field instead of `text` column
      field:

-# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
-# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
-shuffle_merged_datasets: true
-
 # A list of one or more datasets to eval the model with.
 # You can use either test_datasets, or val_set_size, but not both.
 test_datasets:
@@ -697,7 +635,7 @@ test_datasets:
    data_files:
      - /workspace/data/eval.jsonl

-# use RL training: 'dpo', 'ipo', 'kto_pair'
+# use RL training: dpo, ipo, kto_pair
 rl:

 # Saves the desired chat template to the tokenizer_config.json for easier inferencing
@@ -717,7 +655,7 @@ dataset_processes: # defaults to os.cpu_count() if not set
 # Only needed if cached dataset is taking too much storage
 dataset_keep_in_memory:
 # push checkpoints to hub
-hub_model_id: # private repo path to push finetuned model
+hub_model_id: # repo path to push finetuned model
 # how to push checkpoints to hub
 # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
 hub_strategy:
@@ -796,8 +734,6 @@ peft:
 # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
 relora_steps: # Number of steps per ReLoRA restart
 relora_warmup_steps: # Number of per-restart warmup steps
-relora_anneal_steps: # Number of anneal steps for each relora cycle
-relora_prune_ratio: # threshold for optimizer magnitude when pruning
 relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings

 # wandb configuration if you're using it
@@ -813,7 +749,6 @@ wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_step
 # mlflow configuration if you're using it
 mlflow_tracking_uri: # URI to mlflow
 mlflow_experiment_name: # Your experiment name
-hf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry

 # Where to save the full-finetuned model to
 output_dir: ./completed-model
@@ -847,8 +782,7 @@ save_total_limit: # Checkpoints saved at a time
 max_steps:

 eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", chrf]
+eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
@@ -867,7 +801,7 @@ group_by_length: false
 gradient_checkpointing: false
 # additional kwargs to pass to the trainer for gradient checkpointing
 # gradient_checkpointing_kwargs:
-#   use_reentrant: true
+#   use_reentrant: false

 # Stop training after this many evaluation losses have increased in a row
 # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
@@ -877,11 +811,14 @@ early_stopping_patience: 3
 lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
-cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)

 # For one_cycle optim
 lr_div_factor: # Learning rate div factor

+# For log_sweep optim
+log_sweep_min_lr:
+log_sweep_max_lr:
+
 # Specify optimizer
 # Valid values are driven by the Transformers OptimizerNames class, see:
 # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
@@ -907,26 +844,7 @@ lr_div_factor: # Learning rate div factor
 # - paged_adamw_8bit
 # - paged_lion_32bit
 # - paged_lion_8bit
-# - galore_adamw
-# - galore_adamw_8bit
-# - galore_adafactor
-# - galore_adamw_layerwise
-# - galore_adamw_8bit_layerwise
-# - galore_adafactor_layerwise
 optimizer:
-# Dictionary of arguments to pass to the optimizer
-optim_args:
-# For Galore Optimizers the following optim_args are available
-# rank:  # type: int
-# update_proj_gap  # type: int
-# scale  # type: float
-# proj_type:  # type: str, default = std
-
-# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
-optim_target_modules:
-# - self_attn  # for llama
-# - mlp
-
 # Specify weight decay
 weight_decay:
 # adamw hyperparams
@@ -1123,10 +1041,6 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-##### FSDP + QLoRA
-
-Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.md) for more information.
-
 ##### Weights & Biases Logging

 Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
@@ -1188,7 +1102,7 @@ Please use `--sample_packing False` if you have it on and receive the error simi

 ### Merge LORA to base

-The following command will merge your LORA adapater with your base model. You can optionally pass the argument `--lora_model_dir` to specify the directory where your LORA adapter was saved, otherwhise, this will be inferred from `output_dir` in your axolotl config file.  The merged model is saved in the sub-directory `{lora_model_dir}/merged`.
+The following command will merge your LORA adapter with your base model.  You can optionally pass the argument `--lora_model_dir` to specify the directory where your LORA adapter was saved, otherwise, this will be inferred from `output_dir` in your axolotl config file.  The merged model is saved in the sub-directory `{lora_model_dir}/merged`.

 ```bash
 python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
@@ -1249,7 +1163,7 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke

 1. Materialize some data using `python -m axolotl.cli.preprocess your_config.yml --debug`, and then decode the first few rows with your model's tokenizer.
 2. During inference, right before you pass a tensor of token ids to your model, decode these tokens back into a string.
-3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines.  If they aren't the same, adjust your inference server accordingly.
+3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines.  If they aren't the same, adjust your inference server accordingly.
 4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.

 Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this.  See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.
@@ -1296,20 +1210,11 @@ PRs are **greatly welcome**!

 Please run below to setup env
 ```bash
-git clone https://github.com/OpenAccess-AI-Collective/axolotl
-cd axolotl
-
-pip3 install packaging
-pip3 install -e '.[flash-attn,deepspeed]'
-
 pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

 # test
 pytest tests/
-
-# optional: run against all files
-pre-commit run --all-files
 ```

 Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
@@ -1346,6 +1251,4 @@ consider sponsoring the project via [GitHub Sponsors](https://github.com/sponsor

 #### 🥉 Bronze Sponsors - $500/mo

- - [JarvisLabs.ai](https://jarvislabs.ai)
-
 ---
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,39 +0,0 @@
-FROM winglian/axolotl-base:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV BNB_CUDA_VERSION="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
-    fi
-
-# So we can test the Docker image
-RUN pip install pytest
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
-pytest /workspace/axolotl/tests/e2e/patched/
-pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -1,75 +0,0 @@
-"""
- modal application to run axolotl gpu tests in Modal
- """
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import Image, Stub
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
-    "CUDA": os.environ.get("CUDA", "118"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = (
-    Image.from_dockerfile(
-        pathlib.Path(temp_dir) / "Dockerfile",
-        force_build=True,
-        gpu="A10G",
-    )
-    .env(df_args)
-    .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
-)
-
-stub = Stub("Axolotl CI/CD", secrets=[])
-
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
-
-
-@stub.function(
-    image=cicd_image,
-    gpu=GPU_CONFIG,
-    timeout=45 * 60,
-    cpu=8.0,
-    memory=131072,
-)
-def cicd_pytest():
-    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
-
-
-@stub.local_entrypoint()
-def main():
-    cicd_pytest.remote()
--- a/deepspeed_configs/zero1.json
+++ b/deepspeed_configs/zero1.json
@@ -16,7 +16,6 @@
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero2.json
+++ b/deepspeed_configs/zero2.json
@@ -20,7 +20,6 @@
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
@@ -24,7 +24,6 @@
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -24,7 +24,6 @@
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/devtools/dev_sharegpt.yml
+++ b/devtools/dev_sharegpt.yml
@@ -2,6 +2,7 @@
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -3,10 +3,9 @@ FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
-ARG PYTORCH_VERSION="2.1.2"
+ARG PYTORCH_VERSION="2.0.1"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION

@@ -21,9 +20,9 @@ WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
    fi

 # So we can test the Docker image
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -7,8 +7,8 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a

 ENV PATH="/root/miniconda3/bin:${PATH}"

-ARG PYTHON_VERSION="3.10"
-ARG PYTORCH_VERSION="2.1.2"
+ARG PYTHON_VERSION="3.9"
+ARG PYTORCH_VERSION="2.0.1"
 ARG CUDA="118"
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -3,10 +3,9 @@ FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
-ARG PYTORCH_VERSION="2.1.2"
+ARG PYTORCH_VERSION="2.0.1"
 ARG GITHUB_REF="main"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -25,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
    fi

 # So we can test the Docker image
--- a/docs/debugging.md
+++ b/docs/debugging.md
@@ -74,6 +74,7 @@ pip3 install -e '.[flash-attn,deepspeed]'

 If you developing on a remote host, you can easily use VSCode to debug remotely.  To do so, you will need to follow this [remote - SSH guide](https://code.visualstudio.com/docs/remote/ssh).  You can also see the video below on [Docker and Remote SSH debugging](#video---attaching-to-docker-on-remote-host).

+

 ### Configuration

--- a/docs/fsdp_qlora.md
+++ b/docs/fsdp_qlora.md
@@ -1,37 +0,0 @@
-# FDSP + QLoRA
-
-## Background
-
-Using FSDP with QLoRA is essential for **fine-tuning larger (70b+ parameter) LLMs on consumer GPUs.**  For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs[^1].
-
-Below, we describe how to use this feature in Axolotl.
-
-## Usage
-
-To enable `QLoRA` with `FSDP`, you need to perform the following steps:
-
-> ![Tip]
-> See the [example config](#example-config) file in addition to reading these instructions.
-
-1. Set `adapter: qlora` in your axolotl config file.
-2. Enable FSDP in your axolotl config, as [described here](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#fsdp).
-3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.
-
-## Example Config
-
-[examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl.
-
-## References
-
- [PR #1378](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP.
- Related HuggingFace PRs Enabling FDSP + QLoRA:
-    - Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544 )
-    - Transformers [PR#29587](https://github.com/huggingface/transformers/pull/29587)
-    - TRL [PR#1416](https://github.com/huggingface/trl/pull/1416)
-    - PEFT [PR#1550](https://github.com/huggingface/peft/pull/1550)
-
-
-
-
-[^1]: This was enabled by [this work](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the Answer.AI team.
--- a/docs/input_output.md
+++ b/docs/input_output.md
@@ -1,260 +0,0 @@
-# Template-free prompt construction with the `input_output` format
-
-<!-- TOC -->
-
- [Background](#background)
-    - [Masking Inputs](#masking-inputs)
-    - [You may not want prompt templates](#you-may-not-want-prompt-templates)
-    - [The `input_output` format](#the-input_output-format)
- [Usage](#usage)
-    - [1. Prepare Data](#1-prepare-data)
-    - [2. Use `type: input_output`](#2-use-type-input_output)
-    - [3. Check the prompts](#3-check-the-prompts)
-
-<!-- /TOC -->
-
-<a id="markdown-background" name="background"></a>
-
-## Background
-
-<a id="markdown-masking-inputs" name="masking-inputs"></a>
-
-### Masking Inputs
-
-One of the most popular features of
-[axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) is
-setting the following configuration value:
-
-
-```yaml
-train_on_inputs: false
-```
-
-If you declare a [dataset formats](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#dataset)
-such as `alpaca` or `chatml`, axolotl knows what is an input
-(i.e. human) vs. an output (i.e. the assistant) and masks the input
-labels so that your model can focus on predicting the outputs only.
-
-<a id="markdown-you-may-not-want-prompt-templates" name="you-may-not-want-prompt-templates"></a>
-
-### You may not want prompt templates
-
-However, there are many situations where you don't want to use one of
-these formats or templates (I usually don't!). This is because they can:
-
-   Add unnecessary boilerplate to your prompts.
-   Create artifacts like special delimiters `<|im_start|>` that can
-    quickly become footguns if you don't include them correctly at
-    inference time.
-   Enforce a *chat* interface when you do not want one. Sometimes you
-    just want to fine-tune a model to a very specific task and do NOT
-    want multi-turn conversations, roles, etc.
-   Limit you to only certain roles that the template allows.
-
-<a id="markdown-the-inputoutput-format" name="the-inputoutput-format"></a>
-
-### The `input_output` format
-
-You can construct your prompts without a template by using the
-`input_output` format, by setting `type: input_output` in your
-configuration file like this:
-
-**config.yml**
-
-```yaml
-train_on_inputs: false # Mask segments of your data
-datasets:
-  - path: output.jsonl
-    type: input_output  # use template free prompt construction
-```
-
-Unlike `type: completion`, which is also template-free,
-`type: input_output` allows you to mask segments of your text. More
-details on how this works are described below.
-
-<a id="markdown-usage" name="usage"></a>
-
-## Usage
-
-This is how you can use the `input_output` format:
-
-<a id="markdown-1-prepare-data" name="1-prepare-data"></a>
-
-### 1. Prepare Data
-
-To use the `input_output` format, collect your data in the following
-format into a jsonl file (below is the first row from the file
-`output`.jsonl` pretty printed):
-
-```bash
-$ head -n1 output.jsonl | python -m json.tool
-
-{.cell-output .cell-output-stdout}
-    {
-        "segments": [
-            {
-                "label": true,
-                "text": "<s>Hello\n"
-            },
-            {
-                "label": true,
-                "text": "hi there!. "
-            },
-            {
-                "label": false,
-                "text": "goodbye "
-            },
-            {
-                "label": true,
-                "text": "farewell</s>"
-            }
-        ]
-    }
-```
-
-Set `label:false` when you want to mask a segment of text so that the
-model isn't trained on it. Some things to keep in mind:
-
-> [!IMPORTANT]
-> 1.  **EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl
-    concatenates all the segments as-is.** The tokenizer doesn't add
-    anything additional. Notice how I added spaces, newlines, `<s>`
-    (BOS), and `</s>` (EOS) myself.
-> 2.  Make sure you check the materialized output to validate that the
-    prompt is getting assembled how you like.
-
-<a id="markdown-2-use-type-inputoutput" name="2-use-type-inputoutput"></a>
-
-### 2. Use `type: input_output`
-
-Let's materialize data with our `output.jsonl` file by setting
-`type: input_output` in our axolotl config:
-
-```yaml
-# training_config.yaml
-base_model: mistralai/Mistral-7B-v0.1
-data_seed: 49
-seed: 49
-
-datasets:
-  - path: output.jsonl
-    type: input_output
-val_set_size: 0.1
-
-sequence_len: 896
-sample_packing: false
-
-micro_batch_size: 2
-gradient_accumulation_steps: 3
-eval_batch_size: 2
-num_epochs: 1
-learning_rate: 0.0002
-
-train_on_inputs: false
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
-```
-
-You can use the following command to materialize your data. The
-`--debug` flag will print the tokens, along with the labels so you can
-verify that the correct items are being ignored:
-
-```bash
-$ python -m axolotl.cli.preprocess training_config.yaml --debug
-
-...
-[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557)
-(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2)
-
-```
-
-The format is `decoded_token`(`label`, `token_id`), for example,
-`<s>(1, 1)` means that the token is `<s>`, the label is `1` and the
-token_id is `1`. When the label is `-100` then that token is ignored for
-training.
-
-<a id="markdown-3-check-the-prompts" name="3-check-the-prompts"></a>
-
-### 3. Check the prompts
-
-Here is another way to check the materialized output:
-
-```python
-from transformers import AutoTokenizer
-from datasets import load_from_disk
-import yaml
-
-directory = !ls last_run_prepared/
-with open('training_config.yaml', 'r') as f:
-    cfg = yaml.safe_load(f)
-model_id = cfg['base_model']
-tok = AutoTokenizer.from_pretrained(model_id)
-ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
-```
-
-```python
->>> row = ds[0]
->>> print(tok.decode(row['input_ids']))
-<s> Hello
-    hi there!.  goodbye  farewell</s>
-```
-
-We can check that the right tokens are ingored by comparing the labels
-to each token:
-
-```python
-import pandas as pd
-pd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in
-              zip(row['input_ids'], row['labels'])])
-```
-
-| token | label | id    |
-|-------|-------|-------|
-| 0     | \<s\> | 1     |
-| 1     | Hello | 22557 |
-| 2     | \\n   | 13    |
-| 3     | hi    | 12014 |
-| 4     | there | 736   |
-| 5     | !     | 28808 |
-| 6     | .     | 28723 |
-| 7     |       | 28705 |
-| 8     | good  | -100  |
-| 9     | bye   | -100  |
-| 10    |       | -100  |
-| 11    | fare  | 19111 |
-| 12    | well  | 5458  |
-| 13    | \</s\>| 2     |
-
-
-
-If we look at the input data, the above table seems correct! (The jsonl
-version is repeated below for reference):
-
-
-```bash
-$ head -n1 output.jsonl | python -m json.tool
-
-{.cell-output .cell-output-stdout}
-    {
-        "segments": [
-            {
-                "label": true,
-                "text": "<s>Hello\n"
-            },
-            {
-                "label": true,
-                "text": "hi there!. "
-            },
-            {
-                "label": false,
-                "text": "goodbye "
-            },
-            {
-                "label": true,
-                "text": "farewell</s>"
-            }
-        ]
-    }
-```
--- a/docs/mac.md
+++ b/docs/mac.md
@@ -1,18 +0,0 @@
-# Mac M series support
-
-Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.
-
-Current support:
- [x] Support for all models
- [x] Full training of models
- [x] LoRA training
- [x] Sample packing
- [ ] FP16 and BF16 (awaiting AMP support for MPS in Pytorch)
- [ ] Tri-dao's flash-attn (until it is supported use spd_attention as an alternative)
- [ ] xformers
- [ ] bitsandbytes (meaning no 4/8 bits loading and bnb optimizers)
- [ ] qlora
- [ ] DeepSpeed
-
-Untested:
- FSDP
--- a/docs/optimizers.md
+++ b/docs/optimizers.md
@@ -1,29 +0,0 @@
-# Optimizers
-
-Optimizers are an important component when training LLMs. Optimizers are responsible for updating the model's weights (parameters) based on the gradients computed during backpropagation.
-The goal of an optimizer is to minimize the loss function.
-
-### Adam/AdamW Optimizers
-
-```yaml
-adam_beta1: 0.9
-adam_beta2: 0.999
-adam_epsilon: 1e-8
-weight_decay: 0.0
-```
-
-### GaLore Optimizer
-
-https://huggingface.co/papers/2403.03507
-
-```yaml
-optimizer: galore_adamw | galore_adamw_8bit | galore_adafactor
-optim_args:
-  rank: 128
-  update_proj_gap: 200
-  scale: 0.25
-  proj_type: std
-optim_target_modules:
-  - mlp
-  - attn
-```
--- a/docs/rlhf.md
+++ b/docs/rlhf.md
@@ -34,21 +34,6 @@ datasets:
 rl: ipo
 ```

-#### ORPO
-
-Paper: https://arxiv.org/abs/2403.07691
-
-```yaml
-rl: orpo
-orpo_alpha: 0.1
-remove_unused_columns: false
-
-chat_template: chatml
-datasets:
-  - path: argilla/ultrafeedback-binarized-preferences-cleaned
-    type: orpo.chat_template
-```
-
 #### Using local dataset files
 ```yaml
 datasets:
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-13b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-13b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-34b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-34b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -1,6 +1,7 @@
 base_model: codellama/CodeLlama-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -177,24 +177,6 @@
        "# Buy using the ! the comand will be executed as a bash command\n",
        "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Play with inference"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {},
-      "outputs": [],
-      "source": [
-        "# Buy using the ! the comand will be executed as a bash command\n",
-        "!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
-        "    --qlora_model_dir=\"./qlora-out\" --gradio"
-      ]
    }
  ],
  "metadata": {
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -2,7 +2,7 @@ base_model: tiiuae/falcon-7b
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-
+is_falcon_derived_model: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -5,7 +5,7 @@ base_model: tiiuae/falcon-7b
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-
+is_falcon_derived_model: true
 load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -2,7 +2,7 @@ base_model: tiiuae/falcon-7b
 trust_remote_code: true
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-
+is_falcon_derived_model: true
 load_in_8bit: false
 load_in_4bit: false
 gptq: false
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -1,65 +0,0 @@
-# use google/gemma-7b if you have access
-base_model: mhenrichsen/gemma-7b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-# huggingface repo
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-val_set_size: 0.1
-output_dir: ./out
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-sequence_len: 4096
-sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-
-gradient_accumulation_steps: 3
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: false
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -1,4 +1,5 @@
 base_model: TheBloke/Llama-2-7B-GPTQ
+is_llama_derived_model: false
 gptq: true
 gptq_disable_exllama: true
 model_type: AutoModelForCausalLM
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: false
@@ -59,7 +60,7 @@ s2_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
@@ -56,7 +57,7 @@ s2_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -1,70 +0,0 @@
-base_model: NousResearch/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: yahma/alpaca-cleaned
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 512
-sample_packing: false
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 4
-num_epochs: 4
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.00001
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-  - full_shard
-fsdp_config:
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-special_tokens:
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -1,7 +1,7 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
-
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -49,7 +49,7 @@ flash_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/mistral/Mistral-7b-example/config.yml
+++ b/examples/mistral/Mistral-7b-example/config.yml
@@ -2,6 +2,7 @@
 base_model: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
@@ -60,7 +61,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 #default deepspeed, can use more aggresive if needed like zero2, zero3
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -1,6 +1,7 @@
 base_model: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: true

 load_in_8bit: false
 load_in_4bit: false
@@ -48,7 +49,7 @@ flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -1,79 +0,0 @@
-base_model: mistralai/Mistral-7B-v0.1
-model_type: MistralForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./lora-out
-eval_sample_packing: false
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 1
-num_epochs: 2
-optimizer: adamw_torch
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16: false
-tf32: true
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: false
-sdp_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -1,74 +0,0 @@
-base_model: mistralai/Mixtral-8x7B-v0.1
-model_type: AutoModelForCausalLM
-tokenizer_type: LlamaTokenizer
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.02
-output_dir: ./qlora-out
-
-model_config:
-  output_router_logits: true
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 1024
-sample_packing: false
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-weight_decay: 0.0
-fsdp:
-  - full_shard
-fsdp_config:
-  fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
-special_tokens:
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -16,12 +16,12 @@ output_dir: ./qlora-out

 ## You can optionally freeze the entire model and unfreeze a subset of parameters
 unfrozen_parameters:
-#  - ^lm_head.weight$
-#  - ^model.embed_tokens.weight$[:32000]
-#  - model.layers.2[0-9]+.block_sparse_moe.gate
-#  - model.layers.2[0-9]+.block_sparse_moe.experts
-#  - model.layers.3[0-9]+.block_sparse_moe.gate
-#  - model.layers.3[0-9]+.block_sparse_moe.experts
+#  - lm_head.*
+#  - model.embed_tokens.*
+#  - model.layers.2[0-9]+.block_sparse_moe.gate.*
+#  - model.layers.2[0-9]+.block_sparse_moe.experts.*
+#  - model.layers.3[0-9]+.block_sparse_moe.gate.*
+#  - model.layers.3[0-9]+.block_sparse_moe.experts.*

 model_config:
  output_router_logits: true
@@ -81,7 +81,7 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed: deepspeed_configs/zero2.json
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -1,6 +1,7 @@
 base_model: mistralai/Mistral-7B-v0.1
 model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
@@ -67,7 +68,7 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen-7B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer

+is_qwen_derived_model: true
 trust_remote_code: true

 load_in_8bit: true
@@ -57,7 +58,7 @@ flash_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen-7B
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer

+is_qwen_derived_model: true
 trust_remote_code: true

 load_in_8bit: false
@@ -57,7 +58,7 @@ flash_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
--- a/examples/stablelm-2/1.6b/fft.yml
+++ b/examples/stablelm-2/1.6b/fft.yml
@@ -1,69 +0,0 @@
-base_model: stabilityai/stablelm-2-1_6b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-adapter:
-lora_model_dir:
-lora_r:
-lora_alpha:
-lora_dropout:
-lora_target_linear:
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-flash_attn_cross_entropy: false
-flash_attn_rms_norm: true
-flash_attn_fuse_qkv: false
-flash_attn_fuse_mlp: true
-
-warmup_steps: 100
-evals_per_epoch: 4
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
-weight_decay: 0.1
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/stablelm-2/1.6b/lora.yml
+++ b/examples/stablelm-2/1.6b/lora.yml
@@ -1,66 +0,0 @@
-base_model: stabilityai/stablelm-2-1_6b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-trust_remote_code: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./lora-out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-flash_attn_cross_entropy: false
-flash_attn_rms_norm: true
-
-warmup_steps: 10
-evals_per_epoch: 4
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/stablelm-2/README.md
+++ b/examples/stablelm-2/README.md
@@ -1,36 +0,0 @@
-# StableLM 2
-
-This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case.
-
-## Estimating GPU Requirements
-
-| type          | deepspeed | batch size | context length | vRAM GPU (GBs) |
-|---------------|-----------|------------|----------------|----------------|
-| full finetune | N/A       | 1          | 4096           | ~21.5GBs       |
-| full finetune | zero2     | 1          | 4096           | ~20GBs         |
-| lora          | N/A       | 1          | 4096           | ~16.6GBs       |
-
-The above are estimates and might differ slight depending on the setup for example whether you pack your sequence lengths or not (the above assumes you do to length 4096).
-
-This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html
-
-## Training
-We have example scripts here for both full finetuning and lora using the popular alpaca dataset:
-
-```shell
-# preprocess the dataset
-CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml
-```
-
-Single GPU Training:
-```shell
-python -m axolotl.cli.train examples/stablelm-2/fft.yml --deepspeed deepspeed_configs/zero2.json
-# OR
-python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml
-```
-
-Multinode GPU Training with `accelerate`:
-```shell
-# make sure you've configured accelerate properly
-accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json
-```
--- a/examples/starcoder2/qlora.yml
+++ b/examples/starcoder2/qlora.yml
@@ -1,69 +0,0 @@
-base_model: bigcode/starcoder2-3b
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-
-
-dataset_prepared_path:
-val_set_size: 0.2
-output_dir: ./qlora
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 8192
-sample_packing: true
-pad_to_sequence_len: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 20
-evals_per_epoch: 4
-eval_steps:
-eval_table_size:
-saves_per_epoch: 4
-save_steps:
-save_total_limit: 2
-debug:
-deepspeed:
-weight_decay:
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/tiny-llama/lora-mps.yml
+++ b/examples/tiny-llama/lora-mps.yml
@@ -1,6 +1,7 @@
 base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
--- a/examples/tiny-llama/lora.yml
+++ b/examples/tiny-llama/lora.yml
@@ -1,6 +1,7 @@
 base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: true
 load_in_4bit: false
@@ -15,7 +16,6 @@ output_dir: ./lora-out

 sequence_len: 4096
 sample_packing: true
-eval_sample_packing: false
 pad_to_sequence_len: true

 adapter: lora
--- a/examples/tiny-llama/pretrain.yml
+++ b/examples/tiny-llama/pretrain.yml
@@ -2,6 +2,7 @@ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0

 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: false
@@ -9,9 +10,9 @@ strict: false

 max_steps: 200
 pretraining_dataset:
-  path: c4
-  name: en
-  type: pretrain
+  - path: c4
+    name: en
+    type: pretrain
 dataset_prepared_path:
 val_set_size: 0.0
 output_dir: ./model-out
--- a/examples/tiny-llama/qlora.yml
+++ b/examples/tiny-llama/qlora.yml
@@ -1,6 +1,7 @@
 base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true

 load_in_8bit: false
 load_in_4bit: true
--- a/examples/yi-34B-chat/qlora.yml
+++ b/examples/yi-34B-chat/qlora.yml
@@ -1,7 +1,8 @@
 base_model: 01-ai/Yi-34B-Chat
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
-
+is_mistral_derived_model: false
+is_llama_derived_model: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -28,7 +29,7 @@ num_epochs: 1
 val_set_size: 0.1
 evals_per_epoch: 5
 eval_table_size:
-eval_max_new_tokens: 128
+eval_table_max_new_tokens: 128
 eval_sample_packing: false
 eval_batch_size: 1

--- a/requirements.txt
+++ b/requirements.txt
@@ -1,18 +1,17 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.9.0
-transformers @ git+https://github.com/huggingface/transformers.git@f6261d7d81edd036fc53bfede65fe91f01a661aa
+peft @ git+https://github.com/huggingface/peft.git
+transformers @ git+https://github.com/huggingface/transformers.git@bebeeee01275c32fccec3fa36d8b148d3813a7dc
 tokenizers==0.15.0
-bitsandbytes>=0.43.0
+bitsandbytes>=0.41.1
 accelerate==0.26.1
-deepspeed==0.13.1
-pydantic==2.6.3
+deepspeed>=0.13.1
 addict
 fire
 PyYAML>=6.0
 requests
 datasets>=2.15.0
-flash-attn==2.5.5
+flash-attn==2.3.3
 sentencepiece
 wandb
 einops
@@ -22,13 +21,14 @@ hf_transfer
 colorama
 numba
 numpy>=1.24.4
+mlflow
 # qlora things
-evaluate==0.4.1
+evaluate==0.4.0
 scipy
 scikit-learn==1.2.2
 pynvml
 art
-fschat==0.2.36
+fschat==0.2.34
 gradio==3.50.2
 tensorboard

@@ -39,8 +39,4 @@ s3fs
 gcsfs
 # adlfs

-trl @ git+https://github.com/huggingface/trl.git@304e208f778a5442c30cdda500348226cdc97d90
-fastcore>=1.5.29
-
-lpmm @ git+https://github.com/thu-ml/low-bit-optimizers.git@main
-yacs
+trl>=0.7.9
--- a/setup.py
+++ b/setup.py
@@ -18,7 +18,6 @@ def parse_requirements():
                or "flash-attention" in line
                or "deepspeed" in line
                or "mamba-ssm" in line
-                or "lion-pytorch" in line
            )
            if line.startswith("--extra-index-url"):
                # Handle custom index URLs
@@ -68,13 +67,13 @@ setup(
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.5.5",
+            "flash-attn==2.5.0",
        ],
        "fused-dense-lib": [
            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
        ],
        "deepspeed": [
-            "deepspeed==0.13.1",
+            "deepspeed>=0.13.1",
            "deepspeed-kernels",
        ],
        "mamba-ssm": [
@@ -83,14 +82,5 @@ setup(
        "auto-gptq": [
            "auto-gptq==0.5.1",
        ],
-        "mlflow": [
-            "mlflow",
-        ],
-        "lion-pytorch": [
-            "lion-pytorch==0.1.2",
-        ],
-        "galore": [
-            "galore_torch",
-        ],
    },
 )
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -13,6 +13,7 @@ from threading import Thread
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import urlparse

+import gradio as gr
 import requests
 import torch
 import yaml
@@ -23,7 +24,6 @@ from art import text2art
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
-from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
 from axolotl.logging_config import configure_logging
@@ -214,8 +214,6 @@ def do_inference_gradio(
    cfg: DictDefault,
    cli_args: TrainerCliArgs,
 ):
-    import gradio as gr
-
    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
    prompter = cli_args.prompter
    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
@@ -330,6 +328,7 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
    # load the config from the yaml file
    with open(config, encoding="utf-8") as file:
        cfg: DictDefault = DictDefault(yaml.safe_load(file))
+    cfg.axolotl_config_path = config
    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
    # then overwrite the value
    cfg_keys = cfg.keys()
@@ -342,22 +341,7 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
            else:
                cfg[k] = kwargs[k]

-    cfg.axolotl_config_path = config
-
-    try:
-        device_props = torch.cuda.get_device_properties("cuda")
-        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
-    except:  # pylint: disable=bare-except # noqa: E722
-        gpu_version = None
-
-    cfg = validate_config(
-        cfg,
-        capabilities={
-            "bf16": is_torch_bf16_gpu_available(),
-            "n_gpu": os.environ.get("WORLD_SIZE", 1),
-            "compute_capability": gpu_version,
-        },
-    )
+    validate_config(cfg)

    prepare_optim_env(cfg)

--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -54,7 +54,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
        LOG.warning(msg)
        parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

-    if parsed_cfg.rl and parsed_cfg.rl != "orpo":
+    if parsed_cfg.rl:
        load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
    else:
        load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -47,7 +47,7 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    else:
        register_chatml_template()

-    if cfg.rl and cfg.rl != "orpo":
+    if cfg.rl:
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/core/policies/auto_wrap.py
+++ b/src/axolotl/core/policies/auto_wrap.py
@@ -1,55 +0,0 @@
-"""module for building the auto wrap policy for FSDP"""
-import functools
-
-from peft import PrefixEncoder, PromptEmbedding, PromptEncoder
-from torch.distributed.fsdp.wrap import (
-    _or_policy,
-    lambda_auto_wrap_policy,
-    transformer_auto_wrap_policy,
-)
-from transformers.models.llama.modeling_llama import LlamaDecoderLayer
-from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
-from transformers.models.mixtral.modeling_mixtral import MixtralDecoderLayer
-
-SUPPORTED_AUTO_WRAP_MODEL_TYPES = [
-    "llama",
-    "mistral",
-    "mixtral",
-]
-
-
-def get_wrapping_policy_factory(model_type):
-    if model_type == "llama":
-        layer_to_wrap = LlamaDecoderLayer
-    elif model_type == "mistral":
-        layer_to_wrap = MistralDecoderLayer
-    elif model_type == "mixtral":
-        layer_to_wrap = MixtralDecoderLayer
-
-    def get_wrapping_policy():
-        """This checks for lora layers (has weight and requires_grad)"""
-
-        def lambda_policy_fn(module):
-            return (
-                len(list(module.named_children())) == 0
-                and getattr(module, "weight", None) is not None
-                and module.weight.requires_grad
-            )
-
-        lambda_policy = functools.partial(
-            lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn
-        )
-        transformer_layer_name = layer_to_wrap
-        transformer_wrap_policy = functools.partial(
-            transformer_auto_wrap_policy,
-            transformer_layer_cls=(
-                PrefixEncoder,
-                PromptEncoder,
-                PromptEmbedding,
-                transformer_layer_name,
-            ),
-        )
-        policies = [lambda_policy, transformer_wrap_policy]
-        return functools.partial(_or_policy, policies=policies)
-
-    return get_wrapping_policy
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -5,52 +5,39 @@ Builder for the training args and trainer

 import abc
 import importlib
-import importlib.util
 import logging
 import math
-import os
 import sys
 from abc import abstractmethod
-from collections import defaultdict
 from dataclasses import dataclass, field
 from functools import wraps
 from pathlib import Path
-from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union
+from typing import List, Optional, Type, Union

-import lpmm
 import torch
 import transformers
-from accelerate import FullyShardedDataParallelPlugin
-from accelerate.utils import str_to_bool
 from datasets import Dataset
-from torch import nn
-from torch.distributed.fsdp import MixedPrecision
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
    EarlyStoppingCallback,
-    PreTrainedModel,
    Trainer,
    TrainerCallback,
    TrainingArguments,
 )
 from transformers.trainer_utils import seed_worker
-from transformers.utils import is_sagemaker_mp_enabled
 from trl import DPOTrainer

-from axolotl.core.policies.auto_wrap import get_wrapping_policy_factory
-from axolotl.core.trainers import OptimizerNames
-from axolotl.loraplus import create_loraplus_optimizer
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
    LossWatchDogCallback,
+    SaveAxolotlConfigtoMlflowCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
-    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
 )
 from axolotl.utils.collators import (
@@ -63,15 +50,8 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.schedulers import (
    get_cosine_schedule_with_min_lr,
    get_cosine_schedule_with_quadratic_warmup,
-    get_cosine_schedule_with_warmup_decay_constant,
 )

-# monkeypatch so it accepts our custom optimizers
-transformers.training_args.OptimizerNames = OptimizerNames
-
-if is_sagemaker_mp_enabled():
-    import smdistributed.modelparallel.torch as smp
-
 try:
    import torch._dynamo  # pylint: disable=ungrouped-imports
 except ImportError:
@@ -80,10 +60,6 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")


-def is_mlflow_available():
-    return importlib.util.find_spec("mlflow") is not None
-
-
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    if isinstance(tag_names, str):
        tag_names = [tag_names]
@@ -155,10 +131,6 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=None,
        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
    )
-    relora_prune_ratio: Optional[float] = field(
-        default=0.9,
-        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
-    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
@@ -171,9 +143,6 @@ class AxolotlTrainingArguments(TrainingArguments):
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
-    do_causal_lm_eval: Optional[bool] = field(
-        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
-    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
@@ -191,26 +160,6 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
-    cosine_constant_lr_ratio: Optional[float] = field(
-        default=None,
-        metadata={
-            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
-        },
-    )
-    loraplus_lr_ratio: Optional[float] = field(
-        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
-    )
-    loraplus_lr_embedding: Optional[float] = field(
-        default=1e-6,
-        metadata={"help": "loraplus learning rate for lora embedding layers."},
-    )
-    qlora: bool = field(
-        default=False,
-        metadata={"help": "whether this is a qlora training"},
-    )
-    orpo_alpha: Optional[float] = field(
-        default=None,
-    )


 class AxolotlTrainer(Trainer):
@@ -227,122 +176,13 @@ class AxolotlTrainer(Trainer):
        num_epochs=1,
        bench_data_collator=None,
        eval_data_collator=None,
-        **kwargs,
+        **kwargs
    ):
        self.num_epochs = num_epochs
        self.bench_data_collator = bench_data_collator
        self.eval_data_collator = eval_data_collator
        super().__init__(*_args, **kwargs)
        self.train_data_collator = self.data_collator
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
-        if self.args.orpo_alpha:
-            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
-
-    @staticmethod
-    def get_optimizer_cls_and_kwargs(
-        args: TrainingArguments, model: Optional[PreTrainedModel] = None
-    ) -> Tuple[Any, Any]:
-        optim_args = {}
-        if args.optim_args:
-            for mapping in args.optim_args.replace(" ", "").split(","):
-                key, value = mapping.split("=")
-                optim_args[key] = value
-
-        optimizer_kwargs = {"lr": args.learning_rate}
-
-        adam_kwargs = {
-            "betas": (args.adam_beta1, args.adam_beta2),
-            "eps": args.adam_epsilon,
-        }
-
-        if args.optim in [
-            OptimizerNames.LPMM_ADAMW_4BIT,
-            OptimizerNames.LPMM_ADAMW_4BIT_FUSED,
-        ]:
-            optimizer_cls = lpmm.optim.AdamW
-            optimizer_kwargs.update(adam_kwargs)
-            if args.optim == OptimizerNames.LPMM_ADAMW_4BIT_FUSED:
-                optimizer_kwargs.update({"fused": True})
-            return optimizer_cls, optimizer_kwargs
-
-        return Trainer.get_optimizer_cls_and_kwargs(
-            args,
-            model=model,
-        )
-
-    def create_optimizer(self):
-        opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
-
-        if self.optimizer is None:  # pylint: disable=access-member-before-definition
-            decay_parameters = self.get_decay_parameter_names(opt_model)
-            optimizer_grouped_parameters = [
-                {
-                    "params": [
-                        p
-                        for n, p in opt_model.named_parameters()
-                        if (n in decay_parameters and p.requires_grad)
-                    ],
-                    "weight_decay": self.args.weight_decay,
-                },
-                {
-                    "params": [
-                        p
-                        for n, p in opt_model.named_parameters()
-                        if (n not in decay_parameters and p.requires_grad)
-                    ],
-                    "weight_decay": 0.0,
-                },
-            ]
-
-            (
-                optimizer_cls,
-                optimizer_kwargs,
-            ) = AxolotlTrainer.get_optimizer_cls_and_kwargs(self.args)
-
-            if self.args.loraplus_lr_ratio:
-                loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
-                loraplus_lr_embedding = getattr(
-                    self.args, "loraplus_lr_embedding", None
-                )
-                self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
-                    opt_model,
-                    optimizer_cls,
-                    optimizer_kwargs,
-                    loraplus_lr_ratio,
-                    loraplus_lr_embedding,
-                )
-
-            else:
-                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
-                    optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
-                )
-
-            if optimizer_cls.__name__ == "Adam8bit":
-                import bitsandbytes
-
-                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
-
-                skipped = 0
-                for module in opt_model.modules():
-                    if isinstance(module, nn.Embedding):
-                        skipped += sum(
-                            {
-                                p.data_ptr(): p.numel() for p in module.parameters()
-                            }.values()
-                        )
-                        LOG.info(f"skipped {module}: {skipped/2**20}M params")
-                        manager.register_module_override(
-                            module, "weight", {"optim_bits": 32}
-                        )
-                        LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
-                LOG.info(f"skipped: {skipped/2**20}M params")
-
-        if is_sagemaker_mp_enabled():
-            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
-                self.optimizer
-            )
-
-        return self.optimizer

    def create_scheduler(
        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
@@ -377,16 +217,6 @@ class AxolotlTrainer(Trainer):
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
-            elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
-                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-                assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
-                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(  # pylint: disable=attribute-defined-outside-init
-                    optimizer,
-                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
-                    num_training_steps=num_training_steps,
-                    min_lr_ratio=self.args.cosine_min_lr_ratio,
-                    constant_lr_ratio=self.args.cosine_constant_lr_ratio,
-                )
            elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                self.lr_scheduler = get_cosine_schedule_with_min_lr(  # pylint: disable=attribute-defined-outside-init
@@ -558,112 +388,8 @@ class AxolotlTrainer(Trainer):
        #     outputs = model(**inputs)
        #     loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
        #     return (loss, outputs) if return_outputs else loss
-        if self.args.orpo_alpha:
-            return self.orpo_compute_loss(model, inputs, return_outputs=return_outputs)
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

-    def orpo_compute_custom_loss(self, logits, labels):
-        logits = logits.contiguous()
-        loss = 0.0
-
-        if labels is not None:
-            # move labels to correct device to enable model parallelism
-            labels = labels.to(logits.device)
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-
-            # Flatten the tokens
-            loss = self.loss_fct(shift_logits.transpose(2, 1), shift_labels).mean(
-                dim=-1
-            )
-
-        return loss
-
-    def orpo_compute_logps(
-        self, prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits
-    ):
-        # Get the shape of chosen_attention_mask[:, :-1]
-        chosen_shape = chosen_attention_mask[:, :-1].shape
-
-        # Calculate the padding size
-        pad_length = chosen_shape[1] - (prompt_attention_mask.shape[1] - 1)
-
-        # Pad prompt_attention_mask with zeros to match the desired shape
-        prompt_attention_mask_padded = torch.nn.functional.pad(
-            prompt_attention_mask[:, 1:], (0, pad_length), mode="constant", value=0
-        )
-
-        # Perform the subtraction operation
-        mask = chosen_attention_mask[:, :-1] > prompt_attention_mask_padded
-
-        per_token_logps = torch.gather(
-            logits[:, :-1, :].log_softmax(-1),
-            dim=2,
-            index=(mask * chosen_inputs[:, 1:]).unsqueeze(2),
-        ).squeeze(2)
-        return torch.mul(per_token_logps, mask.to(dtype=torch.bfloat16)).sum(dim=1).to(
-            dtype=torch.float64
-        ) / mask.sum(dim=1).to(dtype=torch.float64)
-
-    def orpo_compute_loss(self, model, inputs, return_outputs=False):
-        outputs_neg = model(
-            **{
-                "input_ids": inputs["rejected_input_ids"],
-                "attention_mask": inputs["rejected_attention_mask"],
-                "labels": inputs["rejected_labels"],
-            },
-            output_hidden_states=True,
-        )
-        outputs_pos = model(
-            **{
-                "input_ids": inputs["input_ids"],
-                "attention_mask": inputs["attention_mask"],
-                "labels": inputs["labels"],
-            },
-            output_hidden_states=True,
-        )
-
-        # Calculate NLL loss
-        pos_loss = self.orpo_compute_custom_loss(
-            logits=outputs_pos.logits, labels=inputs["input_ids"]
-        )
-
-        # Calculate Log Probability
-        pos_prob = self.orpo_compute_logps(
-            prompt_attention_mask=inputs["prompt_attention_mask"],
-            chosen_inputs=inputs["input_ids"],
-            chosen_attention_mask=inputs["attention_mask"],
-            logits=outputs_pos.logits,
-        )
-        neg_prob = self.orpo_compute_logps(
-            prompt_attention_mask=inputs["prompt_attention_mask"],
-            chosen_inputs=inputs["rejected_input_ids"],
-            chosen_attention_mask=inputs["rejected_attention_mask"],
-            logits=outputs_neg.logits,
-        )
-
-        # Calculate log odds
-        log_odds = (pos_prob - neg_prob) - (
-            torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob))
-        )
-        sig_ratio = torch.nn.functional.sigmoid(log_odds)
-        ratio = torch.log(sig_ratio)
-
-        # Calculate the Final Loss
-        loss = torch.mean(pos_loss - self.args.orpo_alpha * ratio).to(
-            dtype=torch.bfloat16
-        )
-
-        metrics = {}
-        metrics["chosen_geometric_mean"] = torch.mean(pos_prob).cpu().item()
-        metrics["rejected_geometric_mean"] = torch.mean(neg_prob).cpu().item()
-        metrics["log_odds_ratio"] = torch.mean(ratio).cpu().item()
-        metrics["log_odds"] = torch.mean(log_odds).cpu().item()
-        self.store_metrics(metrics, train_eval="train")
-
-        return (loss, outputs_pos) if return_outputs else loss
-
    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
@@ -674,78 +400,6 @@ class AxolotlTrainer(Trainer):

        return super().push_to_hub(*args, **kwargs)

-    @wraps(Trainer.create_accelerator_and_postprocess)
-    def create_accelerator_and_postprocess(self):
-        rank = int(os.environ.get("LOCAL_RANK", 0))
-        res = super().create_accelerator_and_postprocess()
-
-        if self.args.qlora is False:
-            return res
-
-        # the rest of this method override is specific to fsdp + qlora (for now)
-        sync_module_states = (
-            str_to_bool(os.environ.get("FSDP_SYNC_MODULE_STATES", "True")) == 1
-        )
-
-        mp_policy = None
-        amp = os.environ["ACCELERATE_MIXED_PRECISION"]
-        if amp == "fp16":
-            mp_policy = MixedPrecision(
-                param_dtype=torch.float32,
-                reduce_dtype=torch.float32,
-                buffer_dtype=torch.float32,
-            )
-        elif amp == "bf16":
-            mp_policy = MixedPrecision(
-                param_dtype=torch.float32,
-                reduce_dtype=torch.float32,
-                buffer_dtype=torch.float32,
-            )
-
-        # If somehow we figure out how we want to parameterize we want to autocast buffers...
-        # mp_policy = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.bfloat16, buffer_dtype=torch.float32)
-        # load_param_skip_names = ['inv_freq']
-
-        if self.is_fsdp_enabled:
-            wrapping_policy = get_wrapping_policy_factory(self.args.model_type)
-            fsdp_plugin = FullyShardedDataParallelPlugin(
-                auto_wrap_policy=wrapping_policy(),
-                cpu_offload=False,
-                use_orig_params=False,
-                limit_all_gathers=True,
-                param_init_fn=lambda module: module.to_empty(
-                    device=torch.device("cuda"), recurse=False
-                )
-                if (rank != 0 and sync_module_states)
-                else None,
-                mixed_precision_policy=mp_policy,
-            )
-            self.accelerator.state.fsdp_plugin = fsdp_plugin
-
-        return res
-
-    def log(self, logs: Dict[str, float]) -> None:
-        """
-        Log `logs` on the various objects watching training, including stored metrics.
-
-        Args:
-            logs (`Dict[str, float]`):
-                The values to log.
-        """
-        # logs either has 'loss' or 'eval_loss'
-        train_eval = "train" if "loss" in logs else "eval"
-        # Add averaged stored metrics to logs
-        for key, metrics in self._stored_metrics[train_eval].items():
-            logs[key] = torch.tensor(metrics).mean().item()
-        del self._stored_metrics[train_eval]
-        return super().log(logs)
-
-    def store_metrics(
-        self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
-    ) -> None:
-        for key, value in metrics.items():
-            self._stored_metrics[train_eval][key].append(value)
-

 class AxolotlMambaTrainer(AxolotlTrainer):
    """
@@ -969,11 +623,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
            )
-        if self.cfg.use_mlflow and is_mlflow_available():
-            from axolotl.utils.callbacks.mlflow_ import (
-                SaveAxolotlConfigtoMlflowCallback,
-            )
-
+        if self.cfg.use_mlflow:
            callbacks.append(
                SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
            )
@@ -993,11 +643,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if self.cfg.do_bench_eval:
            callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer))
-        if self.cfg.do_causal_lm_eval:
-            CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory(
-                trainer, self.tokenizer
-            )
-            callbacks.append(CausalLMBenchEvalCallback(self.cfg))

        if self.cfg.early_stopping_patience:
            early_stop_cb = EarlyStoppingCallback(
@@ -1056,14 +701,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                training_arguments_kwargs[
                    "gradient_checkpointing_kwargs"
                ] = self.cfg.gradient_checkpointing_kwargs
+            else:
+                training_arguments_kwargs["gradient_checkpointing_kwargs"] = {
+                    "use_reentrant": False
+                }
        if self.cfg.fsdp:
            training_arguments_kwargs["fsdp"] = self.cfg.fsdp
            if self.cfg.fsdp_config:
                training_arguments_kwargs["fsdp_config"] = dict(self.cfg.fsdp_config)

-        if self.cfg.adapter == "qlora":
-            training_arguments_kwargs["qlora"] = True
-
        # deepspeed
        if self.cfg.deepspeed:
            training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed
@@ -1118,11 +764,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True

-        if self.cfg.remove_unused_columns is not None:
-            training_arguments_kwargs[
-                "remove_unused_columns"
-            ] = self.cfg.remove_unused_columns
-
        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
            # no eval set, so don't eval
            training_arguments_kwargs["evaluation_strategy"] = "no"
@@ -1150,8 +791,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval
            if self.cfg.bench_dataset:
                training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset
-        if self.cfg.do_causal_lm_eval:
-            training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval
        if self.cfg.metric_for_best_model:
            training_arguments_kwargs[
                "metric_for_best_model"
@@ -1212,10 +851,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.load_best_model_at_end is not False
                or self.cfg.early_stopping_patience
            )
-            and (
-                (not self.cfg.test_datasets and self.cfg.val_set_size > 0)
-                or (self.cfg.test_datasets and self.cfg.val_set_size == 0)
-            )
+            and not self.cfg.test_datasets
+            and self.cfg.val_set_size > 0
            and self.cfg.save_steps
            and self.cfg.eval_steps
            and self.cfg.save_steps % self.cfg.eval_steps == 0
@@ -1236,22 +873,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
        )
-        if self.cfg.optim_args:
-            if isinstance(self.cfg.optim_args, dict):
-                optim_args = ",".join(
-                    [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
-                )
-            else:
-                optim_args = self.cfg.optim_args
-            training_arguments_kwargs["optim_args"] = optim_args
-        if self.cfg.optim_target_modules:
-            training_arguments_kwargs[
-                "optim_target_modules"
-            ] = self.cfg.optim_target_modules
-        training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
-        training_arguments_kwargs[
-            "loraplus_lr_embedding"
-        ] = self.cfg.loraplus_lr_embedding
        training_arguments_kwargs["lr_scheduler_type"] = (
            self.cfg.lr_scheduler
            if self.cfg.lr_scheduler
@@ -1262,9 +883,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
        )
        training_arguments_kwargs["cosine_min_lr_ratio"] = self.cfg.cosine_min_lr_ratio
-        training_arguments_kwargs[
-            "cosine_constant_lr_ratio"
-        ] = self.cfg.cosine_constant_lr_ratio
        training_arguments_kwargs["weight_decay"] = (
            self.cfg.weight_decay if self.cfg.weight_decay is not None else 0.0
        )
@@ -1282,70 +900,32 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs[
            "sample_packing_seq_len_multiplier"
        ] = self.cfg.micro_batch_size
-        if self.cfg.relora_steps:
-            training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps
-            training_arguments_kwargs[
-                "relora_warmup_steps"
-            ] = self.cfg.relora_warmup_steps
-            if self.cfg.relora_anneal_steps:
-                training_arguments_kwargs[
-                    "relora_anneal_steps"
-                ] = self.cfg.relora_anneal_steps
-            if self.cfg.relora_prune_ratio:
-                training_arguments_kwargs[
-                    "relora_prune_ratio"
-                ] = self.cfg.relora_prune_ratio
-
+        training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps
+        training_arguments_kwargs["relora_warmup_steps"] = self.cfg.relora_warmup_steps
+        training_arguments_kwargs["relora_anneal_steps"] = self.cfg.relora_anneal_steps
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)

-        if self.cfg.rl == "orpo":
-            training_arguments_kwargs["orpo_alpha"] = self.cfg.orpo_alpha
-
        if self.cfg.neftune_noise_alpha is not None:
            training_arguments_kwargs[
                "neftune_noise_alpha"
            ] = self.cfg.neftune_noise_alpha

-        trainer_kwargs = {}
-
-        if self.cfg.optimizer == "lion_pytorch":
-            from lion_pytorch import Lion
-
-            lion_kwargs = {"lr": training_arguments_kwargs["learning_rate"]}
-            if "weight_decay" in training_arguments_kwargs:
-                lion_kwargs["weight_decay"] = training_arguments_kwargs["weight_decay"]
-
-            if (
-                "adam_beta1" in training_arguments_kwargs
-                and "adam_beta2" in training_arguments_kwargs
-            ):
-                lion_kwargs["betas"] = (
-                    training_arguments_kwargs["adam_beta1"],
-                    training_arguments_kwargs["adam_beta2"],
-                )
-
-            trainer_kwargs["optimizers"] = (
-                Lion(params=self.model.parameters(), **lion_kwargs),
-                None,
-            )
-            # Set default so transformers doesn't throw
-            training_arguments_kwargs["optim"] = "adamw_hf"
-
-        if self.cfg.optimizer == "adamw_anyprecision":
-            if Path(self.cfg.torchdistx_path).exists():
-                sys.path.append(self.cfg.torchdistx_path)
-                importlib.import_module("torchdistx")
-
        training_args = (
            AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
                **training_arguments_kwargs,
            )
        )
        training_args = self.hook_post_create_training_args(training_args)
+        trainer_kwargs = {}
+
+        if self.cfg.optimizer == "adamw_anyprecision":
+            if Path(self.cfg.torchdistx_path).exists():
+                sys.path.append(self.cfg.torchdistx_path)
+                importlib.import_module("torchdistx")

        data_collator_kwargs = {
            "padding": True,  # True/"longest" is the default
--- a/src/axolotl/core/trainers/__init__.py
+++ b/src/axolotl/core/trainers/__init__.py
@@ -1,40 +0,0 @@
-"""module for trainer helpers like OptimizerNames"""
-
-from transformers.utils import ExplicitEnum
-
-
-class OptimizerNames(ExplicitEnum):
-    """
-    Stores the acceptable string identifiers for optimizers.
-    """
-
-    ADAMW_HF = "adamw_hf"
-    ADAMW_TORCH = "adamw_torch"
-    ADAMW_TORCH_FUSED = "adamw_torch_fused"
-    ADAMW_TORCH_XLA = "adamw_torch_xla"
-    ADAMW_TORCH_NPU_FUSED = "adamw_torch_npu_fused"
-    ADAMW_APEX_FUSED = "adamw_apex_fused"
-    ADAFACTOR = "adafactor"
-    ADAMW_ANYPRECISION = "adamw_anyprecision"
-    SGD = "sgd"
-    ADAGRAD = "adagrad"
-    ADAMW_BNB = "adamw_bnb_8bit"
-    ADAMW_8BIT = "adamw_8bit"  # just an alias for adamw_bnb_8bit
-    LION_8BIT = "lion_8bit"
-    LION = "lion_32bit"
-    PAGED_ADAMW = "paged_adamw_32bit"
-    PAGED_ADAMW_8BIT = "paged_adamw_8bit"
-    PAGED_LION = "paged_lion_32bit"
-    PAGED_LION_8BIT = "paged_lion_8bit"
-    RMSPROP = "rmsprop"
-    RMSPROP_BNB = "rmsprop_bnb"
-    RMSPROP_8BIT = "rmsprop_bnb_8bit"
-    RMSPROP_32BIT = "rmsprop_bnb_32bit"
-    GALORE_ADAMW = "galore_adamw"
-    GALORE_ADAMW_8BIT = "galore_adamw_8bit"
-    GALORE_ADAFACTOR = "galore_adafactor"
-    GALORE_ADAMW_LAYERWISE = "galore_adamw_layerwise"
-    GALORE_ADAMW_8BIT_LAYERWISE = "galore_adamw_8bit_layerwise"
-    GALORE_ADAFACTOR_LAYERWISE = "galore_adafactor_layerwise"
-    LPMM_ADAMW_4BIT = "lmpp_adamw_4bit"
-    LPMM_ADAMW_4BIT_FUSED = "lmpp_adamw_4bit_fused"
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -30,7 +30,6 @@ class ColorfulFormatter(Formatter):

 DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
-    "disable_existing_loggers": False,
    "formatters": {
        "simple": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
--- a/src/axolotl/loraplus.py
+++ b/src/axolotl/loraplus.py
@@ -1,133 +0,0 @@
-"""Module for LoRA+"""
-
-# MIT License
-#
-# Copyright (c) 2024 nikhil-ghosh-berkeley
-# https://github.com/nikhil-ghosh-berkeley/loraplus
-
-import logging
-from functools import reduce
-
-from peft.tuners import lora
-from torch import nn
-from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
-from transformers.trainer_pt_utils import get_parameter_names
-
-LOG = logging.getLogger("axolotl.loraplus")
-
-
-def get_module(name, opt_model):
-    """
-    Retrieve a module from a model using its parameter name.
-    Args:
-        name (str): Full name of the parameter, typically including module path.
-        opt_model (torch.nn.Module): The model from which to retrieve the module.
-
-    Returns:
-        Module corresponding to the given name.
-    """
-    parent_idx = 2 if "lora" in name else 1
-    module_names = name.split(sep=".")[:-parent_idx]
-    module = reduce(getattr, module_names, opt_model)
-    return module
-
-
-def create_loraplus_optimizer(
-    opt_model,
-    optimizer_cls,
-    optimizer_kwargs,
-    loraplus_lr_ratio,
-    loraplus_lr_embedding=None,
-):
-    """
-    Creates an optimizer for the given model, applying LoRA-specific learning rate adjustments to different parameter groups.
-
-    Args:
-        opt_model (torch.nn.Module): The model for which the optimizer is being created.
-        optimizer_cls (class): The class of the optimizer to be used (e.g., torch.optim.Adam).
-        optimizer_kwargs (dict): A dictionary of keyword arguments for the optimizer's initialization.
-        loraplus_lr_ratio (float): The learning rate ratio to be applied to LoRA parameters.
-        loraplus_lr_embedding (float, optional): A specific learning rate for embedding parameters, with a default value if not provided.
-
-    Returns:
-        An instance of the specified optimizer class configured with the model's parameters organized into groups with custom learning rates.
-    """
-
-    assert loraplus_lr_ratio is not None, "loraplus_lr_ratio must be provided."
-
-    if loraplus_lr_embedding is None:
-        loraplus_lr_embedding = 1e-6
-
-    decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
-    decay_parameters = [name for name in decay_parameters if "bias" not in name]
-    param_groups = {
-        "groupA": {},
-        "groupB": {},
-        "groupB_no_decay": {},
-        "embedding": {},
-    }
-
-    for name, param in opt_model.named_parameters():
-        if not param.requires_grad:
-            continue
-
-        module = get_module(name, opt_model)
-        if isinstance(module, lora.Embedding):
-            param_groups["embedding"][name] = param
-        elif "lora_B" in name or param.ndim == 1:
-            if name in decay_parameters:
-                param_groups["groupB"][name] = param
-            else:
-                param_groups["groupB_no_decay"][name] = param
-        else:
-            param_groups["groupA"][name] = param
-
-    assigned_param_groups = ""
-    for group, group_params in param_groups.items():
-        assigned_param_groups += f"{group}\n {list(group_params.keys())}\n\n"
-    LOG.info(assigned_param_groups)
-
-    lr = optimizer_kwargs["lr"]  # pylint: disable=invalid-name
-    weight_decay = optimizer_kwargs.get("weight_decay", 0.0)
-
-    optimizer_grouped_parameters = [
-        {
-            "params": list(param_groups["groupA"].values()),
-            "weight_decay": weight_decay,
-            "lr": lr,
-        },
-        {
-            "params": list(param_groups["embedding"].values()),
-            "weight_decay": weight_decay,
-            "lr": loraplus_lr_embedding,
-        },
-        {
-            "params": list(param_groups["groupB"].values()),
-            "weight_decay": weight_decay,
-            "lr": lr * loraplus_lr_ratio,
-        },
-        {
-            "params": list(param_groups["groupB_no_decay"].values()),
-            "weight_decay": 0.0,
-            "lr": lr * loraplus_lr_ratio,
-        },
-    ]
-
-    optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
-    if optimizer_cls.__name__ == "Adam8bit":
-        import bitsandbytes
-
-        manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
-
-        skipped = 0
-        for module in opt_model.modules():
-            if isinstance(module, nn.Embedding):
-                skipped += sum(
-                    {p.data_ptr(): p.numel() for p in module.parameters()}.values()
-                )
-                LOG.info(f"skipped {module}: {skipped/2**20}M params")
-                manager.register_module_override(module, "weight", {"optim_bits": 32})
-                LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
-        LOG.info(f"skipped: {skipped/2**20}M params")
-
-    return optimizer
--- a/src/axolotl/monkeypatch/fastchat_conversation_turns.py
+++ b/src/axolotl/monkeypatch/fastchat_conversation_turns.py
@@ -106,7 +106,7 @@ def get_turns(  # pylint: disable=too-many-return-statements
        if self.system_message:
            contains_sys_msg = True
            if self.messages:
-                # There is no clear guidance on how to handle system messages in Mistral so we just prepend it to the first human instruction separated by a newline
+                # There is no clear guidance on how to handle system messages in Mistral so we just prepend it to the first human instruction separated by a newline
                first_role, first_msg = self.messages[0]
                if first_role == self.roles[0]:
                    system_prompt = self.system_template.format(
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -44,18 +44,6 @@ except ImportError:
 LOG = logging.getLogger("axolotl")


-def is_xformers_swiglu_available() -> bool:
-    from xformers.ops.common import get_xformers_operator
-
-    try:
-        get_xformers_operator("swiglu_packedw")()
-        return True
-    except RuntimeError as exc:
-        if "No such operator xformers::swiglu_packedw " in str(exc):
-            return False
-        return True
-
-
 def replace_llama_mlp_with_swiglu(model):
    for name, module in model.named_modules():
        if isinstance(module, LlamaMLP):
@@ -287,9 +275,7 @@ def flashattn_forward_with_s2attn(
    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
-    cos, sin = self.rotary_emb(
-        value_states, seq_len=kv_seq_len, position_ids=position_ids
-    )
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
@@ -439,9 +425,7 @@ def flashattn_forward(
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]

-    cos, sin = self.rotary_emb(
-        value_states, seq_len=kv_seq_len, position_ids=position_ids
-    )
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
@@ -704,9 +688,6 @@ def llama_model_forward(
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
-    cache_position: Optional[  # pylint: disable=unused-argument
-        torch.LongTensor
-    ] = None,
 ) -> Union[Tuple, BaseModelOutputWithPast]:
    output_attentions = (
        output_attentions
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -1,26 +1,15 @@
 """multipack patching for v2 of sample packing"""
-import importlib

 import transformers
-from accelerate import init_empty_weights
-from transformers import AutoConfig, AutoModelForCausalLM
 from transformers.integrations import is_deepspeed_zero3_enabled

 from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
 from axolotl.monkeypatch.utils import get_unpad_data

-SUPPORTED_MULTIPACK_MODEL_TYPES = [
-    "mixtral",
-    "qwen2",
-    "falcon",
-    "phi",
-    "gemma",
-    "gemmoe",
-    "starcoder2",
-]
+SUPPORTED_MULTIPACK_MODEL_TYPES = ["mixtral", "qwen2", "falcon", "phi"]


-def patch_for_multipack(model_type, model_name=None):
+def patch_for_multipack(model_type):
    if model_type == "mixtral":
        transformers.models.mixtral.modeling_mixtral._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
@@ -39,23 +28,3 @@ def patch_for_multipack(model_type, model_name=None):
        transformers.models.phi.modeling_phi._get_unpad_data = (  # pylint: disable=protected-access
            get_unpad_data
        )
-    elif model_type == "gemma":
-        transformers.models.gemma.modeling_gemma._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "starcoder2":
-        transformers.models.starcoder2.modeling_starcoder2._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
-    elif model_type == "gemmoe":
-        model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-        # we need to load the model here in order for modeling_gemmoe to be available
-        with init_empty_weights():
-            AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
-        module_name = model_config.__class__.__module__.replace(
-            ".configuration_gemmoe", ".modeling_gemmoe"
-        )
-        modeling_gemmoe = importlib.import_module(module_name)
-        modeling_gemmoe._get_unpad_data = (  # pylint: disable=protected-access
-            get_unpad_data
-        )
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -46,9 +46,8 @@ def reset_optimizer(
    *,
    reset_params: list[str],  # where str is the key to a torch.nn.Parameter
    optimizer_state_keys: list[str],
-    prune_ratio: float = 0.9,
 ):
-    pruning_fn = partial(magnitude_pruning_, prune_ratio=prune_ratio)
+    pruning_fn = partial(magnitude_pruning_, prune_ratio=0.9)
    n_zeros = 0
    n_total = 0

@@ -160,7 +159,6 @@ class ReLoRACallback(TrainerCallback):
                    optimizer,
                    reset_params=lora_params,
                    optimizer_state_keys=optimizer_state_keys,
-                    prune_ratio=args.relora_prune_ratio,
                )

            if self.quantized:
@@ -267,7 +265,7 @@ class ReLoRAScheduler(LRScheduler):
        original = self.inner_schedule.get_lr()
        step = self.last_epoch

-        if step < self.relora_steps - self.warmup_steps:
+        if step < self.relora_steps:
            scale = 1
        else:
            per_relora_progress = step % self.relora_steps
--- a/src/axolotl/plugins/oaaic/__init__.py
+++ b/src/axolotl/plugins/oaaic/__init__.py
--- a/src/axolotl/utils/config/models/__init__.py
+++ b/src/axolotl/utils/config/models/__init__.py
--- a/src/axolotl/plugins/oaaic/data/streaming_sql.py
+++ b/src/axolotl/plugins/oaaic/data/streaming_sql.py
@@ -0,0 +1,72 @@
+"""Streaming data source that pages through a PostgreSQL table by id."""
+import os
+from typing import Callable, Generator, Tuple
+
+import psycopg
+import psycopg.conninfo
+from psycopg import sql
+from psycopg.rows import dict_row
+
+
+def pgsql(pgsql_table=None, id_field="id", **kwargs) -> Callable:
+    """
+    Build a generator function that streams every row of a PostgreSQL table.
+
+    The connection string is read from the ``PGSQL_CONN`` environment variable.
+    Rows are fetched one page at a time, ordered by ``id_field``, and each new
+    page starts after the last ``id_field`` value already yielded.
+
+    Args:
+        pgsql_table: name of the table to read; may be schema-qualified
+            (``schema.table``).
+        id_field: column used for ordering and paging. Every row is visited
+            exactly once only if this column is unique and sortable.
+        **kwargs: accepted and ignored.
+
+    Returns:
+        A zero-argument generator function. Iterating its result opens a
+        connection and yields ``(row[id_field], row_as_dict)`` tuples.
+
+    Raises:
+        ValueError: if ``pgsql_table`` is empty or ``PGSQL_CONN`` is not set.
+    """
+    if not pgsql_table:
+        raise ValueError("missing pgsql_table")
+    pgsql_conn = os.environ.get("PGSQL_CONN", None)
+    if not pgsql_conn:
+        raise ValueError("missing PGSQL_CONN environment variable")
+    conn_dict = psycopg.conninfo.conninfo_to_dict(pgsql_conn)
+    page_size = 10
+
+    # Compose identifiers with psycopg.sql so they are quoted rather than
+    # spliced into the statement as raw text; values go through %s parameters.
+    # NOTE(review): quoted identifiers are case-sensitive in PostgreSQL.
+    table = sql.Identifier(*str(pgsql_table).split("."))
+    key = sql.Identifier(id_field)
+    first_page = sql.SQL("SELECT * FROM {} ORDER BY {} ASC LIMIT %s").format(
+        table, key
+    )
+    next_page = sql.SQL(
+        "SELECT * FROM {} WHERE {} > %s ORDER BY {} ASC LIMIT %s"
+    ).format(table, key, key)
+
+    def data_generator() -> Generator[Tuple, None, None]:
+        with psycopg.connect(**conn_dict) as conn:
+            # dict_row makes rows addressable by column name (row[id_field])
+            with conn.cursor(row_factory=dict_row) as cur:
+                last_id = None
+                while True:
+                    if last_id is None:
+                        cur.execute(first_page, (page_size,))
+                    else:
+                        cur.execute(next_page, (last_id, page_size))
+                    rows = cur.fetchall()
+                    if not rows:
+                        # an empty page means the table is exhausted
+                        return
+                    for row in rows:
+                        yield row[id_field], dict(row)
+                    # resume after the highest id seen so far
+                    last_id = rows[-1][id_field]
+
+    return data_generator
--- a/src/axolotl/prompt_strategies/base.py
+++ b/src/axolotl/prompt_strategies/base.py
@@ -1,20 +0,0 @@
-"""
-module for base dataset transform strategies
-"""
-
-import importlib
-import logging
-
-LOG = logging.getLogger("axolotl")
-
-
-def load(strategy, cfg, module_base=None, **kwargs):
-    try:
-        load_fn = strategy.split(".")[-1]
-        strategy = ".".join(strategy.split(".")[:-1])
-        mod = importlib.import_module(f".{strategy}", module_base)
-        func = getattr(mod, load_fn)
-        return func(cfg, **kwargs)
-    except Exception:  # pylint: disable=broad-exception-caught
-        LOG.warning(f"unable to load strategy {strategy}")
-        return None
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -1,78 +0,0 @@
-"""
-HF Chat Templates prompt strategy
-"""
-from typing import Any, Dict, Optional
-
-from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import Prompter
-from axolotl.utils.chat_templates import chat_templates
-
-
-class ChatTemplatePrompter(Prompter):
-    """prompter for HF chat templates"""
-
-    def __init__(self, tokenizer, chat_template=None, max_length=2048):
-        self.tokenizer = tokenizer
-        self.chat_template = chat_template
-        self.max_length = max_length
-
-    def build_prompt(self, conversation, add_generation_prompt=False):
-        return self.tokenizer.apply_chat_template(
-            conversation,
-            truncation=True,
-            max_length=self.max_length,
-            add_generation_prompt=add_generation_prompt,
-            chat_template=self.chat_template,
-        )
-
-
-class ChatTemplateStrategy(PromptTokenizingStrategy):
-    """
-    Tokenizing strategy for instruction-based prompts.
-    """
-
-    def tokenize_prompt(self, prompt):
-        turns = self.get_conversation_thread(prompt)
-        prompt_ids = self.prompter.build_prompt([turns[0]], add_generation_prompt=True)
-        input_ids = self.prompter.build_prompt(turns)
-
-        if not self.train_on_inputs:
-            user_prompt_len = len(prompt_ids)
-            labels = [-100] * user_prompt_len + input_ids[user_prompt_len:]
-        else:
-            labels = input_ids
-
-        tokenized_prompt = {
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": [1] * len(input_ids),
-        }
-
-        return tokenized_prompt
-
-    def get_conversation_thread(self, prompt):
-        conversations = prompt["conversations"]
-        # remap roles - allow for assistant turn
-        role_map = {
-            "human": "user",
-            "user": "user",
-            "assistant": "assistant",
-            "gpt": "assistant",
-        }
-        turns = [
-            {"role": role_map[t["from"]], "content": t["value"]} for t in conversations
-        ]
-        return turns
-
-
-def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
-    chat_template = (
-        ds_cfg["chat_template"] if ds_cfg and "chat_template" in ds_cfg else "chatml"
-    )
-    strategy = ChatTemplateStrategy(
-        ChatTemplatePrompter(tokenizer, chat_templates(chat_template)),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-    return strategy
--- a/src/axolotl/prompt_strategies/dpo/__init__.py
+++ b/src/axolotl/prompt_strategies/dpo/__init__.py
@@ -1,8 +1,35 @@
 """
 module for DPO style dataset transform strategies
 """
-from functools import partial

-from ..base import load as load_base
+import importlib
+import logging

-load = partial(load_base, module="axolotl.prompt_strategies.dpo")
+LOG = logging.getLogger("axolotl")
+
+
+def load(strategy, cfg):
+    """
+    Resolve a DPO strategy name to a function and call it with ``cfg``.
+
+    Args:
+        strategy: dotted name ``<module>.<function>``; the module part is
+            imported relative to ``axolotl.prompt_strategies.dpo``.
+        cfg: passed as the only argument to the resolved function.
+
+    Returns:
+        Whatever the resolved function returns, or ``None`` (after logging a
+        warning) if anything raised while resolving or calling it.
+    """
+    try:
+        # everything before the last dot is the module, the rest is the function
+        module_name, _, load_fn = strategy.rpartition(".")
+        mod = importlib.import_module(
+            f".{module_name}", "axolotl.prompt_strategies.dpo"
+        )
+        func = getattr(mod, load_fn)
+        return func(cfg)
+    except Exception:  # pylint: disable=broad-exception-caught
+        # `strategy` is left untouched above so the full requested name is logged
+        LOG.warning(f"unable to load strategy {strategy}")
+        return None
--- a/src/axolotl/prompt_strategies/dpo/chatml.py
+++ b/src/axolotl/prompt_strategies/dpo/chatml.py
@@ -5,7 +5,6 @@ DPO strategies for chatml

 def argilla(
    cfg,
-    **kwargs,
 ):  # pylint: disable=possibly-unused-variable,unused-argument
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
@@ -24,28 +23,8 @@ def argilla(
    return transform_fn


-def argilla_chat(
-    cfg,
-    **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
-    """
-    for argilla/dpo-mix-7k conversations
-    """
-
-    def transform_fn(sample):
-        sample[
-            "prompt"
-        ] = f"<|im_start|>user\n{sample['chosen'][0]['content']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
-        return sample
-
-    return transform_fn
-
-
 def icr(
    cfg,
-    **kwargs,
 ):  # pylint: disable=possibly-unused-variable,unused-argument
    """
    chatml transforms for datasets with system, input, chosen, rejected
@@ -69,7 +48,7 @@ def icr(
    return transform_fn


-def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def intel(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
    """
    For Intel Orca DPO Pairs
    """
@@ -91,9 +70,7 @@ def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-arg
    return transform_fn


-def prompt_pairs(
-    cfg, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def prompt_pairs(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
@@ -111,7 +88,7 @@ def prompt_pairs(
    return transform_fn


-def ultra(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def ultra(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
    """
    for ultrafeedback binarized conversations
    """
--- a/src/axolotl/prompt_strategies/dpo/user_defined.py
+++ b/src/axolotl/prompt_strategies/dpo/user_defined.py
@@ -1,41 +0,0 @@
-"""
-User-defined DPO strategies
-"""
-
-
-def default(cfg, dataset_idx=0, **kwargs):  # pylint: disable=unused-argument
-    ds_cfg = cfg["datasets"][dataset_idx]["type"]
-    if not isinstance(ds_cfg, dict):
-        raise ValueError(
-            f"User-defined dataset type must be a dictionary. Got: {ds_cfg}"
-        )
-    field_prompt = ds_cfg.get("field_prompt", "prompt")
-    field_system = ds_cfg.get("field_system", "system")
-    field_chosen = ds_cfg.get("field_chosen", "chosen")
-    field_rejected = ds_cfg.get("field_rejected", "rejected")
-    prompt_format = ds_cfg.get("prompt_format")
-    if not prompt_format:
-        prompt_format = "{" + field_prompt + "}"
-    chosen_format = ds_cfg.get("chosen_format")
-    if not chosen_format:
-        chosen_format = "{" + field_chosen + "}"
-    rejected_format = ds_cfg.get("rejected_format")
-    if not rejected_format:
-        rejected_format = "{" + field_rejected + "}"
-
-    def transform_fn(sample):
-        if (
-            "{" + field_system + "}" in prompt_format
-            and field_system in sample
-            and sample[field_system]
-        ):
-            sample["prompt"] = prompt_format.format(
-                system=sample[field_system], prompt=sample[field_prompt]
-            )
-        else:
-            sample["prompt"] = prompt_format.format(prompt=sample["prompt"])
-        sample["chosen"] = chosen_format.format(chosen=sample[field_chosen])
-        sample["rejected"] = rejected_format.format(rejected=sample[field_rejected])
-        return sample
-
-    return transform_fn
--- a/src/axolotl/prompt_strategies/dpo/zephyr.py
+++ b/src/axolotl/prompt_strategies/dpo/zephyr.py
@@ -3,7 +3,7 @@ DPO strategies for zephyr
 """


-def nectar(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def nectar(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
    def transform_fn(sample):
        data = {}
        data["prompt"] = (
--- a/src/axolotl/prompt_strategies/input_output.py
+++ b/src/axolotl/prompt_strategies/input_output.py
@@ -1,54 +0,0 @@
-"""Module for plain input/output prompt pairs"""
-from typing import Generator, Tuple
-
-from axolotl.prompt_tokenizers import PromptTokenizingStrategy
-from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
-
-
-class RawInputOutputStrategy(PromptTokenizingStrategy):
-    """Prompt Strategy class for input/output pairs"""
-
-    def __init__(self, *args, eos_token=None, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.eos_token = eos_token
-        if not eos_token:
-            self.eos_token = self.tokenizer.eos_token
-
-    def tokenize_prompt(self, prompt):
-        # pylint: disable=duplicate-code
-        input_ids = []
-        labels = []
-        for label, text in self.prompter.build_prompt(prompt["segments"]):
-            tokenized_output = self.tokenizer(
-                text, add_special_tokens=False, return_tensors=None
-            )["input_ids"]
-            input_ids += tokenized_output
-            if label or self.train_on_inputs:
-                labels += tokenized_output
-            else:
-                labels += [IGNORE_TOKEN_ID] * len(tokenized_output)
-
-        tokenized_prompt = {
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": [1] * len(input_ids),
-        }
-
-        return tokenized_prompt
-
-
-class RawInputOutputPrompter(Prompter):
-    """prompter for raw i/o data"""
-
-    def build_prompt(self, source) -> Generator[Tuple[bool, str], None, None]:
-        for segment in source:
-            yield segment["label"], segment["text"]
-
-
-def load(tokenizer, cfg):
-    return RawInputOutputStrategy(
-        RawInputOutputPrompter(),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
--- a/src/axolotl/prompt_strategies/orpo/init.py
+++ b/src/axolotl/prompt_strategies/orpo/init.py
@@ -1,9 +0,0 @@
-"""
-module for ORPO style dataset transform strategies
-"""
-
-from functools import partial
-
-from ..base import load as load_base
-
-load = partial(load_base, module="axolotl.prompt_strategies.orpo")
--- a/src/axolotl/prompt_strategies/orpo/chat_template.py
+++ b/src/axolotl/prompt_strategies/orpo/chat_template.py
@@ -1,187 +0,0 @@
-"""chatml prompt tokenization strategy for ORPO"""
-from typing import Any, Dict, Generator, List, Optional, Tuple
-
-from pydantic import BaseModel
-
-from axolotl.prompt_tokenizers import IGNORE_INDEX, PromptTokenizingStrategy
-from axolotl.prompters import Prompter
-from axolotl.utils.chat_templates import chat_templates
-
-
-class Message(BaseModel):
-    """message/turn"""
-
-    role: str
-    content: str
-    label: Optional[bool] = None
-
-
-class MessageList(BaseModel):
-    """conversation"""
-
-    messages: List[Message]
-
-
-def load(
-    tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
-    """
-    chatml transforms for datasets with system, input, chosen, rejected
-    """
-
-    chat_template = chat_templates("chatml")
-    if ds_cfg and "chat_template" in ds_cfg:
-        chat_template = ds_cfg["chat_template"]
-        try:
-            chat_template = chat_templates(chat_template)
-        except ValueError:
-            pass
-
-    return ORPOTokenizingStrategy(
-        ORPOPrompter(chat_template, tokenizer),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-        dataset_parser=ORPODatasetParsingStrategy(),
-    )
-
-
-class ORPODatasetParsingStrategy:
-    """Strategy to parse chosen rejected dataset into messagelist"""
-
-    def get_chosen_conversation_thread(self, prompt) -> MessageList:
-        """Dataset structure mappings"""
-
-        messages: List[Message] = []
-        if system := prompt.get("system", None):
-            messages.append(Message(role="system", content=system, label=False))
-        messages.append(Message(role="user", content=prompt["prompt"], label=False))
-        messages.append(
-            Message(
-                role="assistant", content=prompt["chosen"][1]["content"], label=True
-            )
-        )
-        return MessageList(messages=messages)
-
-    def get_rejected_conversation_thread(self, prompt) -> MessageList:
-        """Dataset structure mappings"""
-
-        messages: List[Message] = []
-        if system := prompt.get("system", None):
-            messages.append(Message(role="system", content=system, label=False))
-        messages.append(Message(role="user", content=prompt["prompt"], label=False))
-        messages.append(
-            Message(
-                role="assistant", content=prompt["rejected"][1]["content"], label=True
-            )
-        )
-        return MessageList(messages=messages)
-
-
-class ORPOTokenizingStrategy(PromptTokenizingStrategy):
-    """
-    rejected_input_ids
-    input_ids
-    rejected_attention_mask
-    attention_mask
-    rejected_labels
-    labels
-    """
-
-    def __init__(
-        self,
-        *args,
-        dataset_parser=None,
-        **kwargs,
-    ):
-        super().__init__(*args, **kwargs)
-        self.dataset_parser = dataset_parser
-
-    def tokenize_prompt(self, prompt):
-        # pass the rejected prompt/row to the Prompter to get the formatted prompt
-        prompt_len = 0
-        rejected_message_list = self.dataset_parser.get_rejected_conversation_thread(
-            prompt
-        )
-        input_ids = []
-        labels = []
-        for _, (part, label) in enumerate(
-            self.prompter.build_prompt(rejected_message_list)
-        ):
-            if not part:
-                continue
-            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
-            prev_idx = len(input_ids)
-            input_ids += _input_ids[prev_idx:]
-            if label:
-                labels += input_ids[prev_idx:]
-            else:
-                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)
-                prompt_len = len(input_ids)
-        # remap the input_ids, attention_mask and labels
-        rejected_input_ids = input_ids
-        rejected_labels = labels
-        # pass the chosen prompt/row to the Prompter to get the formatted prompt
-        chosen_message_list = self.dataset_parser.get_chosen_conversation_thread(prompt)
-        input_ids = []
-        labels = []
-        for _, (part, label) in enumerate(
-            self.prompter.build_prompt(chosen_message_list)
-        ):
-            if not part:
-                continue
-            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
-            prev_idx = len(input_ids)
-            input_ids += _input_ids[prev_idx:]
-            if label:
-                labels += input_ids[prev_idx:]
-            else:
-                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)
-
-        return {
-            "rejected_input_ids": rejected_input_ids,
-            "rejected_labels": rejected_labels,
-            "rejected_attention_mask": [1] * len(rejected_labels),
-            "input_ids": input_ids,
-            "labels": labels,
-            "attention_mask": [1] * len(labels),
-            "prompt_attention_mask": [1] * prompt_len
-            + [0] * (len(labels) - prompt_len),
-        }
-
-
-class ORPOPrompter(Prompter):
-    """Single Turn prompter for ORPO"""
-
-    def __init__(self, chat_template, tokenizer):
-        self.chat_template = chat_template
-        self.tokenizer = tokenizer
-
-    def build_prompt(
-        self,
-        message_list: MessageList,
-    ) -> Generator[Tuple[str, bool], None, None]:
-        conversation = []
-        for message in message_list.messages:
-            conversation.append(message.model_dump())
-            if message.role == "system":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
-            if message.role == "user":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=True,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
-            if message.role == "assistant":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), True
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -1,18 +1,10 @@
 """Module containing the SimpleShareGPTPromptTokenizingStrategy class"""
-
-import logging
 from typing import Any, Dict, Optional

 from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template

 from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
 from axolotl.prompters import ShareGPTPrompterV2
-from axolotl.utils.tokenization import (
-    chatml_to_conversation,
-    merge_consecutive_messages,
-)
-
-LOG = logging.getLogger("axolotl")


 def register_chatml_template(system_message=None):
@@ -27,16 +19,6 @@ def register_chatml_template(system_message=None):
            sep="<|im_end|>",
        )
    )
-    register_conv_template(
-        Conversation(
-            name="chatml_glaive",
-            system_template="<|im_start|>system\n{system_message}",
-            system_message=system_message,
-            roles=["<|im_start|>user", "<|im_start|>assistant", "<|im_start|>tool"],
-            sep_style=SeparatorStyle.CHATML,
-            sep="<|im_end|>",
-        )
-    )


 def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
@@ -45,13 +27,11 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    )
    field_human = ds_cfg["field_human"] if ds_cfg and "field_human" in ds_cfg else None
    field_model = ds_cfg["field_model"] if ds_cfg and "field_model" in ds_cfg else None
-    roles = ds_cfg["roles"].to_dict() if ds_cfg and "roles" in ds_cfg else None
    strategy = SimpleShareGPTPromptTokenizingStrategy(
        ShareGPTPrompterV2(
            conversation=conversation,
            role_key_model=field_model,
            role_key_human=field_human,
-            roles=roles,
        ),
        tokenizer,
        cfg.train_on_inputs,
@@ -97,26 +77,12 @@ def load_guanaco(tokenizer, cfg):
    )


-def load_glaive(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
-    conversation = (
-        ds_cfg["conversation"]
-        if ds_cfg and "conversation" in ds_cfg
-        else "chatml_glaive"
-    )
-    return GlaiveShareGPTPromptTokenizingStrategy(
-        ShareGPTPrompterV2(conversation=conversation),
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-
 class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
    """
    basic sharegpt strategy to grab conversations from the sample row
    """

-    _strict = False
+    _strict = True

    @property
    def strict(self):
@@ -130,30 +96,10 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
        conversations = prompt["conversations"]
        if self.strict:
            return conversations
-        role_key = "from"
-        if "role" in conversations[0].keys():
-            role_key = "role"
-        value_key = "value"
-        if "text" in conversations[0].keys():
-            value_key = "text"
-        elif "content" in conversations[0].keys():
-            value_key = "content"
-        # remap roles - allow for assistant turn"
-        role_map = {
-            "user": "human",
-            "human": "human",
-            "assistant": "gpt",
-            "gpt": "gpt",
-            "system": "system",
-        }
+        # remap roles - allow for assistant turn
+        role_map = {"human": "human", "assistant": "gpt", "gpt": "gpt"}
        turns = [
-            {
-                "from": (
-                    role_map[t[role_key]] if t[role_key] in role_map else t[role_key]
-                ),
-                "value": t[value_key],
-            }
-            for t in conversations
+            {"from": role_map[t["from"]], "value": t["value"]} for t in conversations
        ]
        return turns

@@ -197,15 +143,3 @@ class UltrachatShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingSt
            {"from": role_map[t["role"]], "value": t["content"]} for t in conversations
        ]
        return turns
-
-
-class GlaiveShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingStrategy):
-    """
-    sharegpt strategy that remaps glaive data to sharegpt format
-    """
-
-    def get_conversation_thread(self, prompt):
-        conversation = chatml_to_conversation(prompt)
-        conversation = merge_consecutive_messages(conversation)
-
-        return conversation
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -11,7 +11,7 @@ from transformers import BatchEncoding, PreTrainedTokenizer
 from axolotl.monkeypatch.fastchat_conversation_turns import (
    add_get_turns_to_conversation,
 )
-from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
+from axolotl.prompters import IGNORE_TOKEN_ID

 LOG = logging.getLogger("axolotl")

@@ -37,7 +37,7 @@ class PromptTokenizingStrategy(abc.ABC):

    def __init__(
        self,
-        prompter: Prompter,
+        prompter,
        tokenizer,
        train_on_inputs: bool = False,
        sequence_len: int = 2048,
@@ -340,23 +340,6 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
            self.prompter._conversation.copy()  # pylint: disable=protected-access
        )

-        input_roles = {conversation.roles[0]}
-        output_roles = {conversation.roles[1]}
-
-        if len(conversation.roles) == 3:
-            tool_role_label = conversation.roles[2]
-            input_roles.add(tool_role_label)
-
-        # Add roles from the config
-        if self.prompter.roles:
-            if "input" in self.prompter.roles and self.prompter.roles["input"]:
-                for role in self.prompter.roles["input"]:
-                    input_roles.add(role)
-
-            if "output" in self.prompter.roles and self.prompter.roles["output"]:
-                for role in self.prompter.roles["output"]:
-                    output_roles.add(role)
-
        # support for custom roles from the dataset, only useful for vicuna style prompts/roles
        role_remap = []
        if (
@@ -377,18 +360,11 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    LOG.warning(f"expected tuple, got {part}")
                    continue

+                user, assistant = conversation.roles
                role, content = part

                # Uses "in" because role contains extra characters
-                input_turn = any(r.lower() in role.lower() for r in input_roles)
-                output_turn = any(r.lower() in role.lower() for r in output_roles)
-                empty_role = role.strip() == ""
-
-                if not any([input_turn, output_turn, empty_role]):
-                    LOG.warning(f"unhandled role: {role}")
-                    continue
-
-                if input_turn:
+                if user in role:
                    role = (
                        role.replace(role_remap[0]["from"], role_remap[0]["to"])
                        if role_remap
@@ -408,7 +384,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    else:
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
-                elif output_turn:
+                elif assistant in role:
                    role = (
                        role.replace(role_remap[1]["from"], role_remap[1]["to"])
                        if role_remap
@@ -439,7 +415,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                        labels[:len_role] = [IGNORE_TOKEN_ID] * min(
                            len_role, len(labels)
                        )
-                elif empty_role:
+                elif role == "":
                    turn = content
                    # this is only ever the first part, should include the bos token and the user query
                    res = self._tokenize(
@@ -450,6 +426,9 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                    else:
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                else:
+                    LOG.warning(f"unhandled role: {role}")
+                    continue

                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -259,12 +259,6 @@ SHAREGPT_ASSERTION_FAILED_ROLE = (
    "Role did not alternate between turns (gpt and human). Please check your data."
 )

-CONVERSATION_ROLE_FORMAT = {
-    "chatml": "<|im_start|>{ROLE}",
-    "zephyr": "<|{ROLE}|>",
-    "vicuna_v1.1": "{ROLE}",
-}
-

 class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
    """
@@ -273,10 +267,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods

    role_key_human = "human"
    role_key_model = "gpt"
-    # Optional, only used for tool usage datasets.
-    role_key_tool: Optional[str] = None
-    # Optional, role input/output mapping
-    roles: Optional[dict] = None

    def __init__(
        self,
@@ -284,8 +274,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
        conversation: Optional[Union[str, Conversation]] = None,
        role_key_human: Optional[str] = None,
        role_key_model: Optional[str] = None,
-        role_key_tool: Optional[str] = None,
-        roles: Optional[dict] = None,
    ):
        if conversation:
            if isinstance(conversation, Conversation):
@@ -298,10 +286,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
            self.role_key_human = role_key_human
        if role_key_model:
            self.role_key_model = role_key_model
-        if role_key_tool:
-            self.role_key_tool = role_key_tool
-        if roles:
-            self.roles = roles

    def _build_result(self, source):
        if len(source) < 2:
@@ -319,8 +303,6 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods
            source.pop(0)

        roles = {self.role_key_human: conv.roles[0], self.role_key_model: conv.roles[1]}
-        if self.role_key_tool:
-            roles[self.role_key_tool] = conv.roles[2]

        try:
            # Apply prompt templates
@@ -333,23 +315,11 @@ class ShareGPTPrompter(Prompter):  # pylint: disable=too-few-public-methods

        conv.messages = []
        for _, sentence in enumerate(source):
-            from_role = sentence["from"]
-            if from_role in roles:
-                role = roles[from_role]
-            else:
-                if self._conversation.name not in CONVERSATION_ROLE_FORMAT:
-                    raise NotImplementedError(
-                        f"Role ({role}) not in default roles, and {self._conversation.name} does not support role remapping yet."
-                        "Please help us by creating an Issue to add support for this conversation type."
-                    )
-
-                role = CONVERSATION_ROLE_FORMAT[self._conversation.name].format(
-                    ROLE=from_role
-                )
-
-            if len(conv.messages) > 0 and ((role == conv.messages[-1][0])):
+            role = roles[sentence["from"]]
+            if len(conv.messages) > 0 and (
+                (role == conv.messages[-1][0]) or (role not in conv.roles)
+            ):
                LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}")
-
            conv.append_message(role, sentence["value"])

        return conv.get_turns()
@@ -377,13 +347,11 @@ class ShareGPTPrompterV2(ShareGPTPrompter):
        conversation: Optional[Union[str, Conversation]] = None,
        role_key_human: Optional[str] = None,
        role_key_model: Optional[str] = None,
-        roles: Optional[dict] = None,
    ):
        super().__init__(
            conversation=conversation,
            role_key_human=role_key_human,
            role_key_model=role_key_model,
-            roles=roles,
        )


--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -19,7 +19,7 @@ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from axolotl.common.cli import TrainerCliArgs
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.freeze import freeze_layers_except
+from axolotl.utils.freeze import freeze_parameters_except
 from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.trainer import setup_trainer

@@ -85,7 +85,7 @@ def train(
    model.generation_config.do_sample = True

    model_ref = None
-    if cfg.rl and cfg.rl != "orpo":
+    if cfg.rl:
        if cfg.adapter and not cfg.rl_adapter_ref_model:
            # use built-in trl autounwrap
            LOG.debug("Passing model_ref: None to RL trainer")
@@ -99,7 +99,7 @@ def train(
    safe_serialization = cfg.save_safetensors is True

    if cfg.unfrozen_parameters:
-        freeze_layers_except(model, cfg.unfrozen_parameters)
+        freeze_parameters_except(model, cfg.unfrozen_parameters)

    trainer = setup_trainer(
        cfg,
@@ -110,6 +110,9 @@ def train(
        total_num_steps,
    )

+    if hasattr(model, "config"):
+        model.config.use_cache = False
+
    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -24,9 +24,9 @@ def check_cuda_device(default_value):
                or not torch.cuda.is_available()
                or device == "auto"
                or torch.device(device).type == "cpu"
-                or torch.device(device).type == "meta"
            ):
                return default_value
+
            return func(*args, **kwargs)

        return wrapper
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -9,6 +9,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Dict, List

 import evaluate
+import mlflow
 import numpy as np
 import pandas as pd
 import torch
@@ -41,8 +42,8 @@ from axolotl.utils.distributed import (
 if TYPE_CHECKING:
    from axolotl.core.trainer_builder import AxolotlTrainingArguments

-IGNORE_INDEX = -100
 LOG = logging.getLogger("axolotl.callbacks")
+IGNORE_INDEX = -100


 class EvalFirstStepCallback(
@@ -61,6 +62,7 @@ class EvalFirstStepCallback(
    ):
        if (
            args.evaluation_strategy == IntervalStrategy.STEPS
+            and args.eval_steps < 1.0
            and state.global_step == 1
        ):
            control.should_evaluate = True
@@ -359,187 +361,6 @@ def bench_eval_callback_factory(trainer, tokenizer):
    return BenchEvalCallback


-def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
-    class CausalLMBenchEvalCallback(TrainerCallback):
-        """Callback to log prediction values during each evaluation"""
-
-        def __init__(self, cfg):
-            self.cfg = cfg
-            self.logged = False
-            self.metrics = self.__maybe_load_metrics()
-
-        def __maybe_load_metrics(self):
-            metrics = {}
-            for metric in self.cfg.eval_causal_lm_metrics:
-                try:
-                    metrics[metric] = evaluate.load(metric)
-                except Exception as exc:  # pylint: disable=broad-exception-caught
-                    LOG.warning(f"{metric}: {exc.args}")
-            return metrics
-
-        def on_evaluate(
-            self,
-            args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
-            state: TrainerState,
-            control: TrainerControl,
-            train_dataloader,  # pylint: disable=unused-argument
-            eval_dataloader,
-            **kwargs,  # pylint: disable=unused-argument
-        ):
-            trainer.model.eval()
-            device = torch.device(self.cfg.device)
-
-            # pylint: disable=duplicate-code
-            generation_config = GenerationConfig(
-                max_new_tokens=self.cfg.eval_max_new_tokens,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                pad_token_id=tokenizer.pad_token_id,
-                do_sample=False,
-                use_cache=True,
-                return_dict_in_generate=True,
-                output_attentions=False,
-                output_hidden_states=False,
-                output_scores=False,
-            )
-
-            def find_ranges(lst):
-                ranges = []
-                start = 0
-                for i in range(1, len(lst)):
-                    if lst[i] == 0:
-                        ranges.append((start, i - 1))
-                        start = i
-                end = len(lst) - 1
-                ranges.append((start, end))
-                return ranges
-
-            def compute(metric: evaluate.Metric, **kwargs):
-                # safely compute a metric and return the score if the format is correct
-                metric_score = None
-                try:
-                    metric_score = metric.compute(**kwargs)
-                    return (
-                        metric_score["score"]
-                        if "score" in metric_score
-                        else metric_score["mean_score"]
-                    )
-                except Exception:  # pylint: disable=broad-exception-caught
-                    LOG.debug(
-                        f"Failed to compute metric {metric.name} with kwargs {kwargs.keys()}"
-                    )
-                return metric_score
-
-            def evaluate_preds(sources, predictions, references):
-                scores = {}
-
-                for metric_name, metric in self.metrics.items():
-                    score = compute(
-                        metric,
-                        references=references,
-                        predictions=predictions,
-                        sources=sources,
-                    )
-                    score = score or compute(
-                        metric,
-                        references=[[r] for r in references],
-                        predictions=predictions,
-                    )
-                    scores[metric_name] = score
-                return scores
-
-            def predict_with_generate():
-                eval_src, eval_pred, eval_ref = [], [], []
-
-                for batch in tqdm(eval_dataloader):
-                    batch_labels = batch["labels"].to(device)
-                    batch_input_ids = batch["input_ids"].to(device)
-
-                    if "position_ids" in batch:
-                        batch_pos_ids = batch["position_ids"].tolist()
-                    else:
-                        batch_pos_ids = [None] * len(batch["input_ids"])
-
-                    prompt_token_ids_list = []
-                    completion_token_ids_list = []
-
-                    for input_ids_all, labels_all, pos_ids in zip(
-                        batch_input_ids,
-                        batch_labels,
-                        batch_pos_ids,
-                    ):
-                        if pos_ids is None:
-                            pos_ranges = [(0, len(input_ids_all) - 1)]
-                        else:
-                            pos_ranges = find_ranges(pos_ids)
-
-                        for pos_range in pos_ranges:
-                            start, end = pos_range
-                            if start == end:
-                                continue
-
-                            input_ids = input_ids_all[start : end + 1]
-                            labels = labels_all[start : end + 1]
-
-                            tokens_without_loss = labels == IGNORE_INDEX
-                            tokens_with_loss = labels != IGNORE_INDEX
-                            tokens_exclude_padding = input_ids != tokenizer.pad_token_id
-                            prompt_token_includes = (
-                                tokens_without_loss & tokens_exclude_padding
-                            )
-
-                            prompt_token_ids = input_ids[prompt_token_includes]
-                            prompt_token_ids_list.append(prompt_token_ids)
-
-                            completion_token_ids = input_ids[tokens_with_loss]
-                            completion_token_ids_list.append(completion_token_ids)
-
-                    prompt_texts = tokenizer.batch_decode(
-                        prompt_token_ids_list, skip_special_tokens=True
-                    )
-                    completion_texts = tokenizer.batch_decode(
-                        completion_token_ids_list, skip_special_tokens=True
-                    )
-
-                    with torch.no_grad():
-                        prompt_encoding = tokenizer(
-                            prompt_texts, padding=True, return_tensors="pt"
-                        ).to(self.cfg.device)
-                        predictions = trainer.model.generate(
-                            **prompt_encoding, generation_config=generation_config
-                        )
-
-                    prediction_all_tokens = predictions["sequences"].cpu().tolist()
-                    prediction_without_prompt_tokens_list = []
-                    for prompt_token_ids, prediction_tokens in zip(
-                        prompt_token_ids_list, prediction_all_tokens
-                    ):
-                        prediction_without_prompt_tokens = prediction_tokens[
-                            len(prompt_token_ids) :
-                        ]
-                        prediction_without_prompt_tokens_list.append(
-                            prediction_without_prompt_tokens
-                        )
-
-                    predicted_texts = tokenizer.batch_decode(
-                        prediction_without_prompt_tokens_list, skip_special_tokens=True
-                    )
-
-                    eval_src.extend(prompt_texts)
-                    eval_pred.extend(predicted_texts)
-                    eval_ref.extend(completion_texts)
-
-                return eval_src, eval_pred, eval_ref
-
-            if is_main_process():
-                eval_preds = predict_with_generate()
-                trainer.log(evaluate_preds(*eval_preds))
-
-            return control
-
-    return CausalLMBenchEvalCallback
-
-
 def log_prediction_callback_factory(trainer: Trainer, tokenizer):
    class LogPredictionCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""
@@ -567,7 +388,7 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer):

            # pylint: disable=duplicate-code
            generation_config = GenerationConfig(
-                max_new_tokens=self.cfg.eval_max_new_tokens,
+                max_new_tokens=self.cfg.eval_table_max_new_tokens,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
@@ -755,3 +576,31 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
        return control
+
+
+class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
+    """Callback to save axolotl config to mlflow"""
+
+    def __init__(self, axolotl_config_path):
+        self.axolotl_config_path = axolotl_config_path
+
+    def on_train_begin(
+        self,
+        args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
+        state: TrainerState,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,  # pylint: disable=unused-argument
+    ):
+        if is_main_process():
+            try:
+                with NamedTemporaryFile(
+                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
+                ) as temp_file:
+                    copyfile(self.axolotl_config_path, temp_file.name)
+                    mlflow.log_artifact(temp_file.name, artifact_path="")
+                    LOG.info(
+                        "The Axolotl config has been saved to the MLflow artifacts."
+                    )
+            except (FileNotFoundError, ConnectionError) as err:
+                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
+        return control
--- a/src/axolotl/utils/callbacks/mlflow_.py
+++ b/src/axolotl/utils/callbacks/mlflow_.py
@@ -1,44 +0,0 @@
-"""MLFlow module for trainer callbacks"""
-import logging
-from shutil import copyfile
-from tempfile import NamedTemporaryFile
-from typing import TYPE_CHECKING
-
-import mlflow
-from transformers import TrainerCallback, TrainerControl, TrainerState
-
-from axolotl.utils.distributed import is_main_process
-
-if TYPE_CHECKING:
-    from axolotl.core.trainer_builder import AxolotlTrainingArguments
-
-LOG = logging.getLogger("axolotl.callbacks")
-
-
-class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
-    # pylint: disable=duplicate-code
-    """Callback to save axolotl config to mlflow"""
-
-    def __init__(self, axolotl_config_path):
-        self.axolotl_config_path = axolotl_config_path
-
-    def on_train_begin(
-        self,
-        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
-        control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        if is_main_process():
-            try:
-                with NamedTemporaryFile(
-                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
-                ) as temp_file:
-                    copyfile(self.axolotl_config_path, temp_file.name)
-                    mlflow.log_artifact(temp_file.name, artifact_path="")
-                    LOG.info(
-                        "The Axolotl config has been saved to the MLflow artifacts."
-                    )
-            except (FileNotFoundError, ConnectionError) as err:
-                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
-        return control
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
@@ -21,8 +21,7 @@ def chat_templates(user_choice: str):
    templates = {
        "alpaca": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
        "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
-        "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
-        "gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
+        "chatml": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
    }

    if user_choice in templates:
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -3,16 +3,11 @@ import json
 import logging
 import os
 from pathlib import Path
-from typing import Optional

 import torch
 from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.config.models.input.v0_4_1 import (
-    AxolotlConfigWCapabilities,
-    AxolotlInputConfig,
-)
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model_config

@@ -61,13 +56,7 @@ def normalize_config(cfg):
    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    cfg.eval_table_size = cfg.eval_table_size or 0
-    cfg.eval_max_new_tokens = cfg.eval_max_new_tokens or 128
-    cfg.eval_causal_lm_metrics = cfg.eval_causal_lm_metrics or [
-        "sacrebleu",
-        "comet",
-        "ter",
-        "chrf",
-    ]
+    cfg.eval_table_max_new_tokens = cfg.eval_table_max_new_tokens or 128
    choose_device(cfg)
    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
    if cfg.ddp:
@@ -124,7 +113,7 @@ def normalize_config(cfg):
        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
        or cfg.is_llama_derived_model
        or "llama" in cfg.base_model.lower()
-        or (cfg.type_of_model and "llama" in cfg.type_of_model.lower())
+        or (cfg.model_type and "llama" in cfg.model_type.lower())
    )

    # figure out if the model is falcon
@@ -140,7 +129,7 @@ def normalize_config(cfg):
        )
        or cfg.is_falcon_derived_model
        or "falcon" in cfg.base_model.lower()
-        or (cfg.type_of_model and "rwforcausallm" in cfg.type_of_model.lower())
+        or (cfg.model_type and "rwforcausallm" in cfg.model_type.lower())
    )

    cfg.is_mistral_derived_model = (
@@ -153,7 +142,7 @@ def normalize_config(cfg):
        )
        or cfg.is_mistral_derived_model
        or "mistral" in cfg.base_model.lower().split("/")[-1]
-        or (cfg.type_of_model and "mistral" in cfg.type_of_model.lower())
+        or (cfg.model_type and "mistral" in cfg.model_type.lower())
    )

    cfg.is_qwen_derived_model = (
@@ -164,6 +153,9 @@ def normalize_config(cfg):
        ]
    ) or cfg.is_qwen_derived_model

+    if isinstance(cfg.learning_rate, str):
+        cfg.learning_rate = float(cfg.learning_rate)
+
    if isinstance(cfg.pretraining_dataset, dict):
        cfg.pretraining_dataset = [cfg.pretraining_dataset]

@@ -191,28 +183,9 @@ def normalize_cfg_datasets(cfg):
                        f"updating dataset {ds_cfg.path} with `conversation: chatml` to match your chat_template"
                    )
                    cfg.datasets[idx].conversation = "chatml"
-                if ds_cfg.type == "orpo.chat_template" and not ds_cfg.chat_template:
-                    LOG.info(
-                        f"updating dataset {ds_cfg.path} with `chat_template: chatml` to match your chat_template"
-                    )
-                    cfg.datasets[idx].chat_template = "chatml"


-def validate_config(cfg: DictDefault, capabilities: Optional[dict] = None):
-    if capabilities:
-        return DictDefault(
-            dict(
-                AxolotlConfigWCapabilities(
-                    **cfg.to_dict(), capabilities=capabilities
-                ).model_dump(exclude_unset=True)
-            )
-        )
-    return DictDefault(
-        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_unset=True))
-    )
-
-
-def legacy_validate_config(cfg):
+def validate_config(cfg):
    """
    This is a "pre-validation" step that handles the yaml configuration before we have any
    information about the model architecture
@@ -384,11 +357,11 @@ def legacy_validate_config(cfg):
            "hub_model_id is set without any models being saved. To save a model, set either save_steps or saves_per_epoch."
        )

-    if cfg.gptq and cfg.revision_of_model:
+    if cfg.gptq and cfg.model_revision:
        raise ValueError(
-            "revision_of_model is not supported for GPTQ models. "
+            "model_revision is not supported for GPTQ models. "
            + "Please download the model from HuggingFace Hub manually for correct branch, "
-            + "point to its path, and remove revision_of_model from the config."
+            + "point to its path, and remove model_revision from the config."
        )

    # if cfg.sample_packing and cfg.sdp_attention:
@@ -501,6 +474,9 @@ def legacy_validate_config(cfg):
    if cfg.rope_scaling:
        LOG.warning("`rope_scaling` should now be be a key under `model_config`")

+    if cfg.warmup_steps and cfg.warmup_ratio:
+        raise ValueError("warmup_steps and warmup_ratio are mutually exclusive")
+
    if cfg.wandb_run_id and not cfg.wandb_name:
        cfg.wandb_name = cfg.wandb_run_id

@@ -574,21 +550,6 @@ def legacy_validate_config(cfg):
    if cfg.fsdp and "bnb" in cfg.optimizer:
        raise ValueError(f"FSDP not compatible with {cfg.optimizer}")

-    if cfg.do_causal_lm_eval and cfg.eval_sample_packing:
-        raise ValueError(
-            "do_causal_lm_eval is enabled, eval_sample_packing must be set to False"
-        )
-
-    if cfg.eval_causal_lm_metrics:
-        supported_metrics = ["sacrebleu", "comet", "ter", "chrf"]
-        if not isinstance(cfg.eval_causal_lm_metrics, list):
-            raise ValueError("eval_causal_lm_metrics must be a list")
-        # only ["sacrebleu", "comet", "ter", "chrf"] supported
-        if set(cfg.eval_causal_lm_metrics) - set(supported_metrics):
-            raise ValueError(
-                f"eval_causal_lm_metrics must be one of {supported_metrics}"
-            )
-
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/config/models/input/init.py
+++ b/src/axolotl/utils/config/models/input/init.py
--- a/src/axolotl/utils/config/models/input/next/init.py
+++ b/src/axolotl/utils/config/models/input/next/init.py
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
--- a/Show More
+++ b/Show More