Compare commits


185 Commits

Author SHA1 Message Date
Aman Karmani
dfe591435f make lisa training example work on one 24gb gpu 2024-04-02 03:19:54 +00:00
Aman Karmani
5dd9364c00 example config for lisa 2024-04-01 07:27:16 +00:00
Aman Karmani
6185cd5227 fix LISA by ensuring params are not frozen during __init__ 2024-04-01 06:57:28 +00:00
Aman Karmani
b357c93f23 improve lisa callback logging 2024-04-01 04:54:03 +00:00
Wing Lian
21a5094226 fix default and fix attribute traversal for layers 2024-03-31 00:27:04 -04:00
Wing Lian
3a9ad7c66e add lisa support 2024-03-30 22:55:15 -04:00
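
The LISA commits above (add support, keep params unfrozen during __init__, example config, single 24 GB GPU example) hinge on a handful of YAML options. A minimal sketch follows; the key names `lisa_n_layers`, `lisa_step_interval`, and `lisa_layers_attribute` are assumptions inferred from the commit messages, not verified against the merged code:

```yaml
# Hypothetical LISA settings; key names are assumptions based on the commits above.
lisa_n_layers: 4                      # layers left unfrozen at any given step
lisa_step_interval: 20                # resample the unfrozen layers every N optimizer steps
lisa_layers_attribute: model.layers   # attribute path traversed to locate the decoder layers
```

Because only a few transformer blocks carry gradients at a time under layerwise importance sampling, a config along these lines is what lets the example referenced above fit on a single 24 GB GPU.
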
Wing Lian
89134f2143 make sure to install causal_conv1d in docker (#1459) 2024-03-29 16:43:25 -04:00
Wing Lian
6086be85f7 qwen2_moe support w multipack (#1455) 2024-03-29 11:04:53 -04:00
Wing Lian
4a92a3b9ee Nightlies fix v4 (#1458) [skip ci]
* another attempt at github actions

* try again
2024-03-29 11:04:34 -04:00
Wing Lian
46a73e3d1a fix yaml parsing for workflow (#1457) [skip ci] 2024-03-29 10:21:08 -04:00
Wing Lian
da3415bb5a fix how nightly tag is generated (#1456) [skip ci] 2024-03-29 09:29:17 -04:00
Wing Lian
8cb127abeb configure nightly docker builds (#1454) [skip ci]
* configure nightly docker builds

* also test update pytorch in modal ci
2024-03-29 08:25:45 -04:00
Wing Lian
05b398a072 fix some of the edge cases for Jamba (#1452)
* fix some of the edge cases for Jamba

* update requirements for jamba
2024-03-29 02:38:02 -04:00
Keith Stevens
e634118f90 Support loading datasets saved via save_to_disk (#1432)
* Support loading datasets saved via save_to_disk

* Adding comprehensive unittests

* Fix dataset tests due to new hash changes
2024-03-29 00:19:36 -04:00
Wing Lian
02af0820f7 Jamba (#1451)
* fixes for larger models

* add qlora example for deepspeed

* add readme for jamba
2024-03-28 21:03:22 -04:00
Wing Lian
4155e9988f fix layer_replication arg to peft (#1446) 2024-03-27 10:18:56 -04:00
Wing Lian
25afd35842 support layer replication for peft and fix rslora integration (#1445) 2024-03-27 10:16:47 -04:00
Wing Lian
da265dd796 fix for accelerate env var for auto bf16, add new base image and expand torch_cuda_arch_list support (#1413) 2024-03-26 16:46:19 -04:00
WenboPan
e07347b188 Remove seq_len arg in rotary_emb (#1443)
* remove seq_len in llama rotary_emb

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-26 15:19:44 -04:00
Far El
bcdc9b1601 Fix falcon tokenization step (#1441) [skip ci]
* Fix falcon tokenization step

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-26 15:19:34 -04:00
Satpal Singh Rathore
c19d060a74 turn sample_packing on for training (#1438) [skip ci] 2024-03-26 15:19:04 -04:00
Wing Lian
601b77bc9d make sure to capture non-null defaults from config validation (#1415) 2024-03-26 15:18:47 -04:00
NanoCode012
ff939d8a64 fix(dataset): normalize tokenizer config and change hash from tokenizer class to tokenizer path (#1298)
* fix(dataset): normalize tokenizer config and change hash from tokenizer class to tokenizer path

* fix: normalize config
2024-03-25 15:34:54 +09:00
Phuc Van Phan
324d59ea0d docs: update link to docs of advance topic in README.md (#1437) 2024-03-24 21:49:27 -07:00
NanoCode012
f1ebaa07c6 chore(config): refactor old mistral config (#1435)
* chore(config): refactor old mistral config

* chore: add link to colab on readme
2024-03-25 12:00:44 +09:00
Wing Lian
34ba634b8c Fix ORPO multi gpu (#1433)
* don't drop attention_mask for orpo

* handle multi-gpu cases better for orpo

* revert change to not drop the attention_mask from inputs for orpo
2024-03-22 15:22:58 -07:00
Hamel Husain
4e69aa48ab Update docs.yml 2024-03-21 22:36:57 -07:00
Hamel Husain
629450cecd Bootstrap Hosted Axolotl Docs w/Quarto (#1429)
* precommit

* mv styes.css

* fix links
2024-03-21 22:28:36 -07:00
Wing Lian
2a1589f6f6 strip out hacky qlora-fsdp workarounds now that qlora-fsdp fixes are upstreamed (#1428) 2024-03-21 11:56:13 -04:00
Younes Belkada
7d55607368 HF / FEAT: Optimize HF tags (#1425) [skip ci]
* optimize tags

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-21 11:55:56 -04:00
Wing Lian
7803f0934f fixes for dpo and orpo template loading (#1424) 2024-03-20 11:36:24 -04:00
Wing Lian
dd449c5cd8 support galore once upstreamed into transformers (#1409)
* support galore once upstreamed into transformers

* update module name for llama in readme and fix typing for all linear

* bump trl for deprecation fixes from newer transformers

* include galore as an extra and install in docker image

* fix optim_args type

* fix optim_args

* update dependencies for galore

* add galore to cicd dockerfile
2024-03-19 09:26:35 -04:00
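
GaLore support (#1409) rides on the upstream transformers optimizer plumbing, so it surfaces through the optimizer fields of the config. A hedged sketch; `optim_args` appears in the commits above, but its exact format and the spelling of `optim_target_modules` are assumptions:

```yaml
# Hypothetical GaLore settings; option spellings and values are assumptions.
optimizer: galore_adamw
optim_target_modules:
  - self_attn
  - mlp
optim_args: "rank=64, update_proj_gap=100, scale=0.10"
```
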
NanoCode012
40a88e8c4a Feat: Add sharegpt multirole (#1137)
* feat(prompt): support multiple roles for sharegpt

* fix: add handling of empty role back

* feat: rebased and allowed more dynamic roles via config

* fix: variable

* chore: update message

* feat: add vicuna format

* fix: JSON serializable error

* fix: typing

* fix: don't remap for unknown keys

* fix: add roles to pydantic

* feat: add test

* chore: remove leftover print

* chore: remove leftover comment

* chore: remove print

* fix: update test to use chatml
2024-03-19 20:51:49 +09:00
Seungduk Kim
43bdc5d3de Add a config not to shuffle merged dataset (#1394) [skip ci]
* Add a config not to shuffle merged dataset

* Update README.md

* Update src/axolotl/utils/config/models/input/v0_4_1/__init__.py

Co-authored-by: Wing Lian <wing.lian@gmail.com>

* invert the condition name

* update README

* info -> debug

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-19 20:51:00 +09:00
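
The merged-dataset shuffle toggle from #1394 reduces to a single flag; the key name below is an assumption based on the commit title and the "invert the condition name" follow-up:

```yaml
# Assumed flag name; by default merged datasets are shuffled.
shuffle_merged_datasets: false
```
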
NanoCode012
b1e3e1b25f fix(config): passing gradient_checkpoint_kwargs (#1412)
* fix(config): change default use_reentrant to true

* Update trainer_builder.py

* fix: make sure to pass kwargs to enable checkpoint

* chore: lint
2024-03-19 12:57:43 +09:00
Wing Lian
2ea70ebbd8 ORPO (#1419)
* orpo trainer

* rl handling for orpo

* support for remove_unused_columns

* orpo fixes

* fix loader for orpo

* chore: lint

* fix default for remove_unused_columns

* roll ORPO into the main AxolotlTrainer so it can be compatible with some of the other techniques like relora

* better handling of system message for orpo

* revert system prompt changes for chat templates

* no need for else condition

* split dataset parsing into its own component
2024-03-18 13:10:00 -04:00
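
ORPO (#1419) is selected the same way as the other RLHF paths, through the `rl` field. A minimal sketch: `remove_unused_columns` comes from the commit notes above, while `orpo_alpha` is an assumed hyperparameter name:

```yaml
# Minimal ORPO sketch; orpo_alpha is an assumption.
rl: orpo
remove_unused_columns: false
orpo_alpha: 0.1
```
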
jbl
e8c8ea64b3 Update README.md (#1418)
Add Phorm AI Badge
2024-03-17 23:47:46 -04:00
NanoCode012
d485a08393 chore(script): remove redundant setting (#1411) 2024-03-16 21:10:38 +09:00
NanoCode012
f083aed2c7 Fix(readme): Improve README QuickStart info (#1408)
* Fix(readme): Improve README QuickStart info

* chore: add to toc
2024-03-16 21:10:22 +09:00
NanoCode012
868c33954d Feat(readme): Add instructions for Google GPU VM instances (#1410) 2024-03-16 21:10:05 +09:00
Wing Lian
8df7b888ff beta support for multipack with gemmoe: (#1402) 2024-03-14 15:52:23 -04:00
Sebastian Raschka
6366b0c212 Fix Gemma 7b qlora.yml (#1405) 2024-03-14 15:44:38 -04:00
Seungduk Kim
05bcc9ea56 Train parameters exclusively in specific ranges (#1390)
* Train parameters exclusively in specific ranges

* Fix the style and update docs

* Update yaml example
2024-03-14 11:05:42 -04:00
Chirag Jain
3bd8203c35 Don't disable existing loggers when configuring axolotl logging (#1395) 2024-03-14 11:05:21 -04:00
Hamel Husain
8b12468230 Add QLoRA + FSDP Docs (#1403)
* pre commit

* Update fsdp_qlora.md
2024-03-14 11:04:51 -04:00
Chirag Jain
0976781e15 Update ChatTemplate enum to include alpaca and gemma (#1396) 2024-03-13 11:06:02 -04:00
Wing Lian
8a82d2e0a4 add handling for argilla dpo-mix (#1397) 2024-03-12 17:17:10 -04:00
Wing Lian
4326520829 chore: lint (#1389) 2024-03-10 21:02:55 -04:00
Brian Fitzgerald
b7d8a7dc4d Add Glaive conversation format support (#1365)
* Add Glaive conversation format support

* fix black formatting errors

* Fix black and pylint formatting errors

* only set role_key_tool if provided in the dataset constructor

* Update src/axolotl/prompt_strategies/sharegpt.py

Co-authored-by: Wing Lian <wing.lian@gmail.com>

* sharegpt test

* tokenizer test

* fix formatting

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-10 20:50:25 -04:00
Seungduk Kim
b0ee9ec734 Set gradient_clipping to auto in DeepSpeed configs (#1382) [skip ci] 2024-03-10 20:50:12 -04:00
David Baker
0bc114d2e1 Fix pydantic configuration for the max_memory input (#1385) [skip ci]
* Fix pydantic configuration for the max_memory input

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-03-10 20:50:04 -04:00
Wing Lian
7659c001aa support for rslora (#1387) [skip ci] 2024-03-10 20:49:45 -04:00
Wing Lian
3fd8093717 validation for fsdp and deepspeed (#1388) [skip ci]
* validation for fsdp and deepspeed

* make sure to return data
2024-03-10 20:49:25 -04:00
Wing Lian
9b6ee83a73 FDSP + QLoRA (#1378)
* wip qlora + fsdp fixes

* more fixes

* make sure to load the lora 🤦

* only setup quantized meta on non-zero rank:

* only run setup_quantized_peft_meta_for_training for qlora+fsdp

* more fixes for qlora+fsdp

* chore: lint

* add example yml

* support mistral too

* fix for model_type and add mixtral support too

* set cpu_offload: false to reduce vram, constrain new accelerator logic to qlora + fsdp

* refactor for duplicate code
2024-03-08 14:31:01 -05:00
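
The FSDP + QLoRA work (#1378) ultimately combines a 4-bit adapter with an FSDP sharding block in the config. A sketch under the assumption that the example yml added in that PR looks roughly like this; the `fsdp_config` key names are not verified here:

```yaml
# Hypothetical QLoRA + FSDP combination; fsdp_config key names are assumptions.
adapter: qlora
load_in_4bit: true
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: false            # offloading disabled, per the "cpu_offload: false" commit note
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```
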
Wing Lian
638c2dafb5 JarvisLabs (#1372)
* add Jarvis cloud gpu and sponsorship

* whitespace
2024-03-07 10:47:32 -05:00
Wing Lian
58b0d4b0d8 update flash attention for gemma support: (#1368) 2024-03-06 10:08:54 -05:00
Hamel Husain
ed70a08348 add docs for input_output format (#1367) [skip ci]
* add docs

* add docs

* run linter
2024-03-06 09:09:49 -05:00
Wing Lian
0cfdb2c90c support for DoRA w/ PEFT (#1363) 2024-03-05 21:20:15 -05:00
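
DoRA (#1363) and rank-stabilized LoRA (#1387, with the follow-up fix in #1445) are both thin toggles on top of a normal LoRA adapter; the two `peft_use_*` flags below are assumed to be the config spellings:

```yaml
# Assumed flag names for the PEFT LoRA variants referenced above.
adapter: lora
peft_use_dora: true     # weight-decomposed LoRA
peft_use_rslora: true   # rank-stabilized LoRA scaling
```
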
Nicolas Rojas
37657473c8 Remove unsupported python version 3.9 from README (#1364) [skip ci] 2024-03-05 21:19:36 -05:00
Eric Hartford
e0f1895408 add starcoder2 (#1349)
* add starcoder2

* Apply suggestions from code review

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* chore: lint

* Apply suggestions from code review

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2024-03-05 19:49:17 -05:00
Sebastian Raschka
8984bf1722 Update tinyllama lora.yml to fix eval packing issue (#1362) 2024-03-05 14:36:29 -05:00
Wing Lian
2598c9f045 allow the sharegpt handler to also better handle datasets destined for openai finetuning (#1361)
* allow the sharegpt handler to also better handle datasets destined for openai finetuning

* make sure to support system role
2024-03-05 11:43:33 -05:00
Wing Lian
decb66e170 lora+ support (#1352)
* lora+ support

* optimizer should default to None

* include mit license
2024-03-05 07:29:23 -05:00
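
LoRA+ (#1352) assigns a larger learning rate to the LoRA B matrices than to the A matrices, which in config form is a single ratio plus, presumably, an embedding-specific rate. Both key names below are assumptions:

```yaml
# Hypothetical LoRA+ settings; key names are assumptions.
loraplus_lr_ratio: 16         # LR multiplier applied to the LoRA B matrices
loraplus_lr_embedding: 1.0e-6 # separate LR for embedding-layer adapters
```
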
Wing Lian
4d09b42ee3 plain input/output prompt strategy w/o chat templates (#1346)
* plain input/output prompt strategy w/o chat templates

* disable duplicate code check

* make sure to add an eos/eot token to the end of the output so it will stop

* multi turn segment support and test
2024-03-04 16:25:16 -05:00
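
The plain input/output strategy (#1346) bypasses chat templates entirely: records carry labelled text segments, and only the segments marked as outputs contribute to the loss. A hedged config sketch; the `type: input_output` name matches the commit title, the rest is assumed:

```yaml
# Hypothetical dataset stanza for the input_output strategy.
datasets:
  - path: segments.jsonl   # each record holds a "segments" list of labelled text spans (assumed schema)
    type: input_output
train_on_inputs: false     # keep the unlabelled segments out of the loss
```
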
Chirag Jain
b5b44925ec Fix validation for early stopping (#1358) 2024-03-03 22:15:18 -05:00
NanoCode012
170d4d7092 chore: enable sample_packing for Gemma (#1351) 2024-03-01 21:56:22 -05:00
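
Sample packing, toggled here for Gemma, uses the same flags as elsewhere in the repo's examples; the companions below are the usual pairing rather than strict requirements:

```yaml
sample_packing: true
pad_to_sequence_len: true
flash_attention: true
```
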
Wing Lian
00018629e7 run tests again on Modal (#1289) [skip ci]
* run tests again on Modal

* make sure to run the full suite of tests on modal

* run cicd steps via shell script

* run tests in different runs

* increase timeout

* split tests into steps on modal

* increase workflow timeout

* retry doing this with only a single script

* fix yml launch for modal ci

* reorder tests to run on modal

* skip dpo tests on modal

* run on L4s, A10G takes too long

* increase CPU and RAM for modal test

* run modal tests on A100s

* skip phi test on modal

* env not arg in modal dockerfile

* upgrade pydantic and fastapi for modal tests

* cleanup stray character

* use A10s instead of A100 for modal
2024-02-29 14:26:26 -05:00
Wing Lian
6b3b271925 fix for protected model_ namespace w pydantic (#1345) 2024-02-28 15:07:49 -05:00
Chirag Jain
3a5a2d2f34 Fix use_mlflow to be bool instead of str (#1344) 2024-02-28 12:58:29 -05:00
Wing Lian
6d4bbb877f deprecate py 3.9 support, set min pytorch version (#1343) [skip ci] 2024-02-28 12:58:05 -05:00
Wing Lian
0f985e12fe more fixes 20240228 (#1342) [skip ci]
* add missing evals_per_epoch setting

* more pydantic fixes

* more fixes

* move test from normalization to validation

* increase eval size for sample packing tests
2024-02-28 12:57:45 -05:00
Wing Lian
c1a7b3dd69 add gemma instruct chat template (#1341)
* add gemma instruct chat template

* support for chat template strategy too
2024-02-27 17:20:01 -05:00
Ikko Eltociear Ashimine
2b9687f341 Update fastchat_conversation_turns.py (#1294) [skip ci]
seperated -> separated
2024-02-27 09:06:10 -05:00
Wing Lian
2c9c88b32a fix steps check for anneal on first cycle (#1316) 2024-02-27 08:56:08 -05:00
Hamel Husain
5265cd6b2c Update debugging.md (#1339) [skip ci] 2024-02-27 15:47:31 +09:00
NanoCode012
5be8b555a0 fix: checkpoint saving with deepspeed (#1321) 2024-02-27 15:46:44 +09:00
Maxime
0f6af36d50 Mps mistral lora (#1292) [skip ci]
* Lora example for Mistral on MPS backend

* Add some MPS documentation

* Update examples/mistral/lora-mps.yml

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* Update examples/mistral/lora-mps.yml

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* Update README.md

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-26 22:39:57 -05:00
Wing Lian
3f69571943 more pydantic fixes (#1338) 2024-02-26 22:39:13 -05:00
nopperl
1e3d5305d3 Support user-defined prompt processing strategies for dpo (#1248)
* support user-defined prompt processing strategies for dpo

* interpret dict dataset types as user-defined

* fix lint errors

* setup pydantic config for validation of User defined DPO

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-26 18:49:34 -05:00
Maxime
16482796b0 add lion-pytorch optimizer (#1299) [skip ci]
* add lion-pytorch optimizer

* update pydantic to support lion optimizer

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-26 18:45:14 -05:00
Nathan Cooper
f30d062b48 Add StableLM 2 Example Scripts (#1327) [skip ci]
* Add StableLM examples and configurations

* Add FFT and LORA configuration files and modify readme with usage
2024-02-26 18:44:25 -05:00
Wing Lian
269c5436ea hotfix to exclude_unset from pydantic config when converting back to a dict (#1334) 2024-02-26 15:06:25 -05:00
Wing Lian
e7eed203d8 hotfix for missing outputs params (#1333) 2024-02-26 14:36:37 -05:00
Wing Lian
cf002312e0 hotfix for lora rank (#1332) 2024-02-26 14:28:43 -05:00
Wing Lian
7de912e097 hotfix for capabilities loading (#1331) 2024-02-26 14:24:28 -05:00
JohanWork
d75653407c ADD: push checkpoints to mlflow artifact registry (#1295) [skip ci]
* Add checkpoint logging to mlflow artifact registry

* clean up

* Update README.md

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* update pydantic config from rebase

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-26 13:32:39 -05:00
NanoCode012
c6b01e0f4a chore: update readme to be more clear (#1326) [skip ci] 2024-02-26 13:32:13 -05:00
Wing Lian
cc3cebfa70 Pydantic 2.x cfg (#1239)
* WIP conversion to use pydantic for config validation

* wip, more fields, add capabilities

* wip

* update pydantic validation to match existing tests

* tweak requirements

* setup deprecated params pydantic model

* more validations

* wrap up rest of the validations

* flesh out the rest of the options from the readme into pydantic

* fix model validators as class methods

remember to return in validator
missing return
add missing relora attributes
fix test for DictDefault change
fix sys template for mistral from fastchat change in PR 2872
fix test for batch size warning

* more missing attributes for cfg

* updates from PR feedback

* fix validation for datasets and pretrain datasets

* fix test for lora check
2024-02-26 12:24:14 -05:00
Wing Lian
5894f0e57e make mlflow optional (#1317)
* make mlflow optional

* fix xformers

don't patch swiglu if xformers not working
fix the check for xformers swiglu

* fix install of xformers with extra index url for docker builds

* fix docker build arg quoting
2024-02-26 11:41:33 -05:00
kallewoof
5cf226e177 Use yaml codeblock for config.yaml field (#1303) [skip ci] 2024-02-24 21:59:16 +09:00
NanoCode012
2ed52bd568 fix(readme): Clarify doc for tokenizer_config (#1323) [skip ci] 2024-02-24 21:55:04 +09:00
NanoCode012
a359579371 deprecate: pytorch 2.0.1 image (#1315) [skip ci]
* deprecate: pytorch 2.0.1 image

* deprecate from main image

* Update main.yml

* Update tests.yml
2024-02-22 11:39:47 +09:00
Wing Lian
2752d5f958 multipack for gemma (#1313)
* multipack for gemma

* chore: lint

* handle cache_position kwarg in updated llama modeling

* add position_ids to rotary embed call for updated llama modeling
2024-02-21 19:24:21 -05:00
Monk
9e300aca0c Adding Google's gemma Model (#1312) 2024-02-21 12:56:47 -05:00
NanoCode012
3d2cd804ae fix(readme): update inference md link (#1311) [skip ci] 2024-02-22 02:48:06 +09:00
Jared Palmer
6ab69ec5f8 Add instructions for playing with qlora model to colab example (#1290)
* Add instructions for playing with qlora model to colab example

* Update examples/colab-notebooks/colab-axolotl-example.ipynb

Co-authored-by: JohanWork <39947546+JohanWork@users.noreply.github.com>

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
Co-authored-by: JohanWork <39947546+JohanWork@users.noreply.github.com>
2024-02-22 02:46:27 +09:00
David Meikle
3c00f406d6 Allow load_best_model_at_end to be configured for early stopping on custom evaluation datasets (#1291)
* Allow load_best_model_at_end when using test_datasets and val_set_size is zero for custom evaluation datasets

* Fixed formatting following failed Lint check
2024-02-22 00:57:18 +09:00
NanoCode012
a7a9a1433a fix(examples): remove is_*_derived as it's parsed automatically (#1297) 2024-02-22 00:52:46 +09:00
Leonardo Emili
e2786cce6a Validation always happens on first step (#1300) 2024-02-22 00:52:24 +09:00
Leonardo Emili
5a5d47458d Add seq2seq eval benchmark callback (#1274)
* Add CausalLMBenchEvalCallback for measuring seq2seq performance

* Fix code for pre-commit

* Fix typing and improve logging

* eval_sample_packing must be false with CausalLMBenchEvalCallback
2024-02-13 08:24:30 -08:00
김진원
8430db22e2 Scheduler implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (#1273) 2024-02-12 21:23:28 -08:00
Wing Lian
4b997c3e1a allow the optimizer prune ratio for ReLoRA to be configurable (#1287)
* allow the optimizer prune ratio for relora to be configurable

* update docs for relora

* prevent circular imports
2024-02-12 11:39:51 -08:00
Maxime
fac2d98c26 Add MPS support (#1264)
* add mps support

* linter stuff

* CI fixes

* install packaging for various tests

* Update setup.py

* Revert "install packaging for various tests"

This reverts commit 980e7aa44d.

* Revert "CI fixes"

This reverts commit 4609e3b166.

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-12 08:30:32 -05:00
Wing Lian
ea00dd0852 don't use load and push together (#1284) 2024-02-09 14:54:31 -05:00
Hamel Husain
b2a4cb4396 Update README.md (#1281) 2024-02-09 07:38:08 -08:00
Wing Lian
aaf54dc730 run the docker image builds and push on gh action gpu runners (#1218) 2024-02-09 10:32:54 -05:00
Hamel Husain
9bca7db133 add support for https remote yamls (#1277) 2024-02-08 20:02:17 -08:00
Hamel Husain
91cf4ee72c allow remote data paths (#1278)
* allow remote data paths

* add docs about public url

* only allow https

* better docs

* better docs
2024-02-08 15:02:35 -08:00
Wing Lian
1daecd161e copy edits (#1276) 2024-02-08 09:00:04 -05:00
Wing Lian
4a654b331e Add link to axolotl cloud image on latitude (#1275) 2024-02-08 08:50:11 -05:00
Wing Lian
5698943263 simplify handling for newer multipack patches so they can be added in a single place (#1270) 2024-02-07 10:46:04 -05:00
Wing Lian
411293bdca contributor avatars (#1269) 2024-02-07 07:09:01 -08:00
Zac Brannelly
73f1bdaa15 Fix bug preventing model_kwargs being injected (#1262) 2024-02-07 09:38:35 -05:00
JohanWork
1c7ed26785 lock pytorch (#1247) [skip ci] 2024-02-06 07:48:26 -05:00
Philip May
13eea21f9b Add more save strategies for DPO training. (#1255)
* Set save_strategy and save_steps in HFDPOTrainerBuilder

* fix duplicate save_steps
2024-02-06 00:38:43 -05:00
Chirag Jain
1072f28874 Fix typo bloat16 -> bfloat16 (#1257) 2024-02-06 00:38:14 -05:00
Wing Lian
c7cf3810bd Pretrain transforms (#1261)
* wip for pretraining/iterable data with arbitrary prompt strategies

* more fixes, wip

* more fixes for custom pretraining

* iterable ds wrapper not needed

* remove extra features

* chore: lint

* update pretraning example yml

* fix order for partials

* fixup for tests
2024-02-06 00:37:03 -05:00
Wing Lian
8c2e05ade3 relora: magnitude pruning of the optimizer (#1245)
* magnitude pruning of the optimizer

* add alpaca chat template and fix relora patch

* fix handling of lora adapter for relora

* fix merge and save call

* fixes for 8-bit lora merge

* save intermediate checkpoint adapters

* auto merge

* fix eval check

* handle relora annealing

* fix anneal step logic

* chore: lint

* misx fix

* fix types

* Update tests/e2e/test_relora_llama.py

* check for safetensors saved from relora
2024-02-06 00:35:30 -05:00
NanoCode012
2d65f470d5 fix(model): apply gate fp32 only for mixtral (#1241)
* fix(model): apply gate fp32 only for mixtral

* Update src/axolotl/utils/models.py

* fix gate layer check

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-01 13:55:05 -05:00
Wing Lian
dfd188502a add contact info for dedicated support for axolotl [skip ci] (#1243) 2024-02-01 12:59:07 -05:00
Wing Lian
00568c1539 support for true batches with multipack (#1230)
* support for true batches with multipack

* patch the map dataset fetcher to handle batches with packed indexes

* patch 4d mask creation for sdp attention

* better handling for BetterTransformer

* patch general case for 4d mask

* setup forward patch. WIP

* fix patch file

* support for multipack w/o flash attention for llama

* cleanup

* add warning about bf16 vs fp16 for multipack with sdpa

* bugfixes

* add 4d multipack tests, refactor patches

* update tests and add warnings

* fix e2e file check

* skip sdpa test if not at least torch 2.1.1, update docs
2024-02-01 10:18:42 -05:00
Wing Lian
c67fb71583 Peft deepspeed resume (#1227)
* import deepspeed integration

* monkeypatch peft adapter with deepspeed for resume from checkpoint

* fix patch

* fix patches attempt 2

* make sure to set lora_model_dir

* skip pylint for deepspeed.utils

* pick up upstream fix in transformers

* remove monkeypatch for deepspeed/peft fix

* no need to set the lora_model_dir on resume

* unset load_in_*bit when using quant config

* guard before del

* better handling of load_in* kwargs
2024-01-31 18:13:29 -05:00
DreamGenX
25e037fe2d Support for additional_special_tokens (#1221) [skip ci]
* Support for additional_special_tokens

* Support for additional_special_tokens. Adjust whitespace.

* Support for additional_special_tokens. Use correct quotes.

* Support for additional_special_tokens. Safe pop.

* Support for additional_special_tokens. nt.

* Support for additional_special_tokens. cfg.special_tokens may be None.

* add token if not in vocabulary when adding additional_special_tokens

* fix logic for copy/pasta

* bugfix for popping from config and tokenizer reload

* no need to add tokens manually now with previous bugfix

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-31 18:13:13 -05:00
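
`additional_special_tokens` (#1221) slots under the existing `special_tokens` block; the token strings below are purely illustrative:

```yaml
special_tokens:
  additional_special_tokens:   # tokens are added to the vocabulary if missing, per the commits above
    - "<|im_start|>"
    - "<|im_end|>"
```
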
Hamel Husain
52c83d30bf Update rlhf.md (#1237) [skip ci] 2024-01-31 17:27:35 -05:00
Wing Lian
d113331e9a add a helpful motd for cloud image (#1235) [skip ci] 2024-01-31 10:26:02 -05:00
Wing Lian
8f2b591baf set torch version to what is installed during axolotl install (#1234) 2024-01-31 08:47:34 -05:00
DreamGenX
5787e1a23f Fix and document test_datasets (#1228)
* Make sure test_datasets are used and val_set_size is respected.

* Add test_datasets docs.

* Apply suggestions from code review

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-31 06:48:57 -05:00
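
`test_datasets` (#1228) defines an explicit evaluation split instead of carving one out of the training data. The path and type below are hypothetical, and pairing it with `val_set_size: 0` follows the related load_best_model_at_end change (#1291):

```yaml
test_datasets:
  - path: my-org/my-eval-set   # hypothetical dataset
    type: alpaca
    split: test
val_set_size: 0                # use the explicit test split rather than a carved-out validation slice
```
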
xhedit
8608d8003e Fix typo (#1231) [skip ci] 2024-01-31 06:46:55 -05:00
Wing Lian
4cb7900a56 Peft lotfq (#1222)
* loftq support for lora

* fix loftq check

* update readme for loftq

* readability cleanup

* use peft main for loftq fixes, remove unnecessary special tokens

* remove unused test from older deprecation
2024-01-28 18:50:08 -05:00
Filippo Broggini
18f811978c FEAT: add tagging support to axolotl for DPOTrainer (#1209)
* Add AxolotlDPOTrainer

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-26 20:01:57 -05:00
Wing Lian
afb5dd9655 Update FUNDING.yml [skip ci] 2024-01-26 20:00:28 -05:00
Wing Lian
8da1633124 Revert "run PR e2e docker CI tests in Modal" (#1220) [skip ci] 2024-01-26 16:50:44 -05:00
Wing Lian
36d053f6f0 run PR e2e docker CI tests in Modal (#1217) [skip ci]
* wip modal for ci

* handle falcon layernorms better

* update

* rebuild the template each time with the pseudo-ARGS

* fix ref

* update tests to use modal

* cleanup ci script

* make sure to install jinja2 also

* kickoff the gh action on gh hosted runners and specify num gpus
2024-01-26 16:13:27 -05:00
JohanWork
af29d81f80 ADD: warning if hub_model_id is set but no save strategy (#1202)
* warning if hub model id set but no save

* add warning

* move the warning

* add test

* allow more public methods for tests for now

* fix tests

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-26 10:38:55 -05:00
Wing Lian
1b180034c7 ensure the tests use the same version of torch as the latest base docker images (#1215) [skip ci] 2024-01-26 10:38:30 -05:00
DreamGenX
62ca4a2b71 Respect sliding_window=None (#1214) 2024-01-26 07:43:37 -05:00
Igor Berlenko
5407ddd233 Update qlora.yml - remove max_packed_sequence_len (#1210) [skip ci] 2024-01-26 07:43:05 -05:00
Wing Lian
74c72ca5eb drop py39 docker images, add py311, upgrade pytorch to 2.1.2 (#1205)
* drop py39 docker images, add py311, upgrade pytorch to 2.1.2

* also allow the main build to be manually triggered

* fix workflow_dispatch in yaml
2024-01-26 00:38:49 -05:00
Wing Lian
e923e62d24 more checks and fixes for deepspeed and fsdp (#1208) [skip ci] 2024-01-25 20:01:45 -05:00
Wing Lian
ba944e6554 workaround for transformers bug requiring do_sample for saving pretrained (#1206) 2024-01-25 11:34:41 -05:00
Wing Lian
badda3783b make sure to register the base chatml template even if no system message is provided (#1207) 2024-01-25 10:38:08 -05:00
Wing Lian
a01b998c0f Update deps 202401 (#1204) [skip ci]
* update deps

* xformers fix too
2024-01-25 10:11:49 -05:00
Wing Lian
33e117088f precompute dpo logprobs setting and fixes (#1199) [skip ci]
* add support for precompute_ref_log_probs for dpo

* add chatml.icr type for argilla orca dpo

* update inline doc

* also set use_reentrant to false for dpo when not set

* don't set use_reentrant to true for rl

* make sure to set gradient checkpointing too
2024-01-25 09:31:55 -05:00
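
The DPO settings touched in #1199 map onto config roughly as follows. The `chatml.icr` type name is taken from the commit notes; the remaining key spellings, including `gradient_checkpointing_kwargs`, are assumptions:

```yaml
rl: dpo
precompute_ref_log_probs: true      # precompute reference log-probs before training
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs   # hypothetical dataset path
    type: chatml.icr
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false              # set for DPO when not given, per the commit notes
```
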
Ricardo Dominguez-Olmedo
b4ac96adef fix learning rate scheduler's warnings (#1135) [skip ci]
* fix schedulers warnings

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-25 07:09:34 -05:00
mhenrichsen
98b4762077 Feat/chatml add system message (#1117)
* add system message to template

* readme update

* added code to register new system message

* register chatml template for test

---------

Co-authored-by: Mads Henrichsen <mads@BrbartiendeMads.lan>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-25 08:24:27 +01:00
JohanWork
ee0b5f60e5 add colab example (#1196) [skip ci] 2024-01-24 20:09:09 -05:00
NanoCode012
08719b9609 fix(log): improve warning to clarify that lora_modules_to_save expect a list (#1197) 2024-01-24 20:08:34 -05:00
Wing Lian
1427d5b502 prepare for release 0.4.0 (#1175)
2024-01-24 15:00:28 -05:00
Wing Lian
54d2ac155b Mixtral fixes 20240124 (#1192) [skip ci]
* mixtral nccl fixes

* make sure to patch for z3
2024-01-24 14:59:57 -05:00
Oleh Kuznetsov
af0243021c Standardize system prompt format for AlpacaPrompter (#1190) [skip ci] 2024-01-24 14:27:01 -05:00
Wing Lian
8a49309489 upgrade deepspeed to 0.13.1 for mixtral fixes (#1189) [skip ci]
* upgrade deepspeed to 0.13.1 for mixtral fixes

* move deepspeed-kernels install to setup.py
2024-01-24 14:26:40 -05:00
Wing Lian
5bce45f800 more dpo fixes for dataset loading and docs (#1185) [skip ci]
* more dpo fixes for dataset loading and docs

* preprocess dpo datasets
2024-01-24 14:23:55 -05:00
Wing Lian
d85d4942cf report min length of tokenized data (#1186) [skip ci] 2024-01-24 09:17:50 -05:00
Agung Baptiso Sorlawan
02f2c720fc Fix generation_config validation raises Exception for do_merge_lora (#1184) 2024-01-24 00:42:15 -05:00
James Wade
71141deb18 Add support for offline mode with HF_HUB_OFFLINE envvar (#1182)
* Add support for offline mode with HF_HUB_OFFLINE envvar

* Apply styling

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-24 00:41:47 -05:00
Aleksey Korshuk
dc051b861d Update rlhf.md (#1178) [skip ci] 2024-01-23 15:54:51 -05:00
Wing Lian
59a31fe613 DPO fixes v2 (#1174)
* check for length before trying to remove it

* add validation for sample packing with RLHF
2024-01-23 12:56:24 -05:00
Wing Lian
814aee6603 Phi2 multipack (#1173)
* phi2 multipack

* update validation and examples for phi

* more updates to phi examples

* make sure to use the correct collator for phi multipack

* phi needs attention mask now for multipack

* if the special token already exists in the tokenizer, don't require in lora modules to save

* fix qlora yml for phi, fix phi test validation

* test qlora too

* make sure flash attention is enabled for the test

* don't use remote code for phi anymore

* reduce sequence len for sample packing phi
2024-01-23 12:54:36 -05:00
Wing Lian
b715cd549a update docs [skip ci] (#1176) 2024-01-23 11:14:52 -05:00
Wing Lian
fb7f9b9516 don't fail if can't cast weights due to offload when merging (#1172) [skip ci] 2024-01-23 09:17:08 -05:00
Tilemachos Chatzipapas
cc250391a0 Fine-Tuning Mistral-7b for Real-World Chatbot Applications Using Axolotl (Lora used) (#1155)
* Mistral-7b finetune example using axolotl with code,config,data

* Corrected the path for huggingface dataset

* Update data.jsonl

* chore: lint

---------

Co-authored-by: twenty8th <twenty8th@users.noreply.github.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-23 07:32:21 -05:00
Ayush Singh
9135b9e2aa Update README.md (#1169) [skip ci]
Fix typo
2024-01-23 07:25:44 -05:00
Wing Lian
7523d1f557 DPO cleanup (#1126)
* cleanup dpo to be a little more extensible, add zephyr/nectar strategy

* fix eos slash

* support for eval split

* fix kwargs

* handle empty evals

* don't load peft model for dpo

* ensure dpo training args get bf16 for peft if applicable

* fix duplicate kwargs for bf16

* make sure to respect the configured lr scheduler

* support trainer callback to push config to wandb

* set dataloader preload args

* ensure that we are loading the lora when merging

* Update src/axolotl/utils/data.py

Co-authored-by: Agus <agustin.piqueres@gmail.com>

* support local datasets for dpo

Co-authored-by: Agus <agustin.piqueres@gmail.com>

* chore: lint

* dpo/kto/ipo smoke tests w lora, simplify dpo dataset type names

* add split to dpo tests

* fix rebase/merging error

* handle edge case w logging

* use accelerator for dpo datasets so it doesn't break the logger

* missing args

* validate checkpoint is an adapter for now

* log warning when dataset strategy is not loadable

---------

Co-authored-by: Agus <agustin.piqueres@gmail.com>
2024-01-23 00:40:37 -05:00
JohanWork
5439707489 Feat(test): Add tests for alpaca chatml prompt tokenizer (#1088)
* draft for adding test for tokenizer

* clean up

* clean up

* fix pre commit

* fix pylint

* Revert "fix pylint"

This reverts commit cd2cda3cda.

* add pylint exception for pytest fixture

* update comments

* Apply suggestions from code review

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* update spelling and import promptstyle

* rename, restructure

* clean up

* add fmt:on

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2024-01-23 13:30:26 +09:00
Casper
684038111e Add desc to map/filter (#1162)
* Add desc to map/filter

* update descriptions

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-22 21:30:53 -05:00
Wing Lian
cda52dc32b support for explicit test_dataset definition for evals (#786) 2024-01-22 21:29:56 -05:00
Wing Lian
e799e08d3c Falcon embeddings (#1149) [skip docker]
* also fix multipack for falcon and add smoke tests

* make sure to handle special tokens and added tokens for lora

* fix reference to model_type

* fix tests for falcon

* fix stray typo

* fixes for smoke tests
2024-01-22 21:01:42 -05:00
Wing Lian
0f77b8d798 add commit message option to skip docker image builds in ci (#1168) [skip ci] 2024-01-22 19:55:36 -05:00
Wing Lian
32580c1ca7 Vram fix attempt (#1164) [skip ci]
* revert order of filter/drop_long step and handle calc for max_input_len only during preprocessing

* revert some changes to preparing for packing to allow more flexibility

* prepare dataset for packing during pre-processing step

* prepare dataset hash based on sample packing too

* enclose none check

* just cast straight to string for ds hash
2024-01-22 19:54:54 -05:00
Wing Lian
802f9667a2 improve vram use w gradient checkpointing (#1167) [skip ci] 2024-01-22 19:48:22 -05:00
JohanWork
b8e5603467 Add mlflow callback for pushing config to mlflow artifacts (#1125)
* Update callbacks.py

adding callback for mlflow

* Update trainer_builder.py

* clean up
2024-01-22 18:44:39 -05:00
Wing Lian
782b6a4216 set fp16 to false if bf16, update bf16: auto in example YAMLs (#1122) [skip ci]
* set fp16 to false if bf16, update bf16: auto in example YAMLs

* unset fp16 so that it falls back properly if bf16 isn't available

* Update README.md [skip-ci]

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

* test that bf16 disables fp16

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2024-01-22 18:44:01 -05:00
Wing Lian
eaaeefce55 jupyter lab fixes (#1139) [skip ci]
* add a basic notebook for lab users in the root

* update notebook and fix cors for jupyter

* cell is code

* fix eval batch size check

* remove intro notebook
2024-01-22 18:42:40 -05:00
Wing Lian
f5a828aa20 Qwen2 (#1166)
* qwen2 multipack support

* fix qwen derived model check so it doesn't break qwen2

* fixes to ensure qwen2 packing works

* bump requirements for qwen2

* requirements typo
2024-01-22 18:24:15 -05:00
Wing Lian
fccb542b47 make sure the model config loader respects the model_revision too (#1160) [skip-ci] 2024-01-22 13:23:14 -05:00
Wing Lian
2ce5c0d68a Deprecate max packed sequence len (#1141) 2024-01-20 05:11:50 -05:00
NanoCode012
3db5f2fd17 feat(dataset): add config to keep processed dataset in memory (#1152) 2024-01-20 13:19:28 +09:00
Wing Lian
cbecf3e62a fix check for env var (#1151) 2024-01-18 23:58:11 -05:00
Wing Lian
729740df81 Dockerfile cloud ports (#1148)
* explicitly expose ports 8888 and 22

* support for SSH_KEY from latitude
2024-01-18 22:04:25 -05:00
Joe Cummings
08b8ba09a5 Fix link for Minotaur model (#1146) [skip-ci] 2024-01-18 17:22:04 -05:00
Wing Lian
6910e6a8ca Multipack simplify for Mixtral (#1142) 2024-01-18 16:23:49 -05:00
Joe Cummings
1d70f24b50 Add shifted sparse attention (#973) [skip-ci]
* Add s2_attn to hijack flash code

* Refactor code to account for s2_attn

* Add test for models utils

* Add ``s2_attention`` option to llama configs

* Add ``s2_attention`` option to README config

* Format code to appease linter

* chore: lint

* Remove xpos and llama-landmark [bad merge]

* add e2e smoke tests for shifted sparse attention

* remove stray patch from merge

* update yml with link to paper for s2_attention/longlora

* fix assertion check for full fine tune

* increase sequence len for tests and PR feedback updates

* reduce context len to 16k for tests

* reduce context len to 16k for tests

* reduce batch size for larger context len and update test to check message

* fix test for message

---------

Co-authored-by: joecummings <jrcummings@devvm050.nha0.facebook.com>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-18 10:16:07 -05:00
Wing Lian
317fa2555a fix bf16 check when preprocessing data (#1140) 2024-01-17 22:41:23 -05:00
NanoCode012
1e56b88cde fix(preprocess): Make sure dataset not loaded from cache when using preprocess cli (#1136) 2024-01-18 03:03:52 +09:00
Wing Lian
7570446596 Preprocess dataset size fix (#1131)
* overwrite cache on preprocess step
* don't cache the TokenizedPromptDataset at all
* load_from_cache_file no longer needed
2024-01-17 11:02:41 -05:00
204 changed files with 10684 additions and 4548 deletions

.github/FUNDING.yml

@@ -1,6 +1,6 @@
# These are supported funding model platforms
github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: axolotl_ai # Replace with a single Ko-fi username


@@ -59,6 +59,7 @@ body:
label: Config yaml
description: |
Please attach the config yaml!
render: yaml
- type: textarea
id: possible-solution


@@ -1,40 +1,37 @@
name: ci-cd-base
on:
push:
branches:
- "main-base"
- "dev-base"
workflow_dispatch:
jobs:
build-base:
if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners...
runs-on: self-hosted
runs-on: axolotl-gpu-runner
strategy:
fail-fast: false
matrix:
include:
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
uses: actions/checkout@v3
@@ -56,7 +53,7 @@ jobs:
context: .
file: ./docker/Dockerfile-base
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}
build-args: |
CUDA_VERSION=${{ matrix.cuda_version }}

.github/workflows/docs.yml (new file)

@@ -0,0 +1,31 @@
name: Publish Docs
on:
push:
branches:
- main
permissions:
contents: write
pages: write
jobs:
build-deploy:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Set up Quarto
uses: quarto-dev/quarto-actions/setup@v2
- name: Setup Python
uses: actions/setup-python@v3
with:
python-version: '3.10'
- name: install dependencies
run: |
python3 -m pip install jupyter
- name: Publish to GitHub Pages (and render)
uses: quarto-dev/quarto-actions/publish@v2
with:
target: gh-pages
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


@@ -17,6 +17,6 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.9"
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.0


@@ -4,37 +4,33 @@ on:
push:
branches:
- "main"
workflow_dispatch:
jobs:
build-axolotl:
if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners...
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
axolotl_extras:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
is_latest: true
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
axolotl_extras:
runs-on: [self-hosted, gpu, docker]
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -55,57 +51,42 @@ jobs:
uses: docker/build-push-action@v5
with:
context: .
load: true
build-args: |
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
CUDA=${{ matrix.cuda }}
PYTORCH_VERSION=${{ matrix.pytorch }}
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
file: ./docker/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}
- name: Unit Tests
run: |
docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
- name: Push to Docker Hub
if: github.event_name != 'pull_request'
run: |
docker push ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
latest_tag=${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
if [ -n "$latest_tag" ]; then
docker push "$latest_tag"
fi
build-axolotl-runpod:
build-axolotl-cloud:
needs: build-axolotl
if: github.repository_owner == 'OpenAccess-AI-Collective'
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
axolotl_extras:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
axolotl_extras:
runs-on: [self-hosted, gpu, docker]
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -132,7 +113,5 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
winglian/axolotl-runpod:main-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
${{ (matrix.is_latest) && format('{0}-latest', 'winglian/axolotl-runpod:main') || '' }}
labels: ${{ steps.metadata.outputs.labels }}

.github/workflows/nightlies.yml (new file)

@@ -0,0 +1,118 @@
name: docker-nightlies
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
jobs:
build-axolotl:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
is_latest: true
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
with:
images: winglian/axolotl
tags: |
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
# guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
- name: Build and export to Docker
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
CUDA=${{ matrix.cuda }}
PYTORCH_VERSION=${{ matrix.pytorch }}
AXOLOTL_ARGS=${{ matrix.axolotl_args }}
file: ./docker/Dockerfile
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}
build-axolotl-cloud:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
with:
images: winglian/axolotl-cloud
tags: |
type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
CUDA=${{ matrix.cuda }}
file: ./docker/Dockerfile-cloud
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}


@@ -25,7 +25,7 @@ jobs:
- name: Install dependencies
run: |
pip3 install wheel
pip3 install wheel packaging
pip3 install -e .
pip3 install -r requirements-tests.txt


@@ -23,7 +23,7 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.9"
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.0
@@ -33,8 +33,8 @@ jobs:
strategy:
fail-fast: false
matrix:
python_version: ["3.9", "3.10", "3.11"]
timeout-minutes: 10
python_version: ["3.10", "3.11"]
timeout-minutes: 20
steps:
- name: Check out repository code
@@ -48,6 +48,8 @@ jobs:
- name: Install dependencies
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install -U -e .
pip3 install -r requirements-tests.txt
@@ -58,8 +60,8 @@ jobs:
docker-e2e-tests:
if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, gpu, docker]
timeout-minutes: 30
runs-on: [self-hosted, modal]
timeout-minutes: 60
needs: [pre-commit, pytest]
strategy:
@@ -69,40 +71,37 @@ jobs:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
pytorch: 2.1.2
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
num_gpus: 1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
num_gpus: 1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
num_gpus: 1
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
- name: Install Python
uses: actions/setup-python@v5
with:
images: winglian/axolotl-tests
- name: Build Docker image
python-version: "3.10"
- name: Install Modal
run: |
# Set up build arguments
BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
CUDA="${{ matrix.cuda }}"
PYTORCH_VERSION="${{ matrix.pytorch }}"
# Build the Docker image
docker build . \
--file ./docker/Dockerfile-tests \
--build-arg BASE_TAG=$BASE_TAG \
--build-arg CUDA=$CUDA \
--build-arg GITHUB_REF=$GITHUB_REF \
--build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
--tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
--no-cache
- name: Unit Tests w docker image
python -m pip install --upgrade pip
pip install modal jinja2
- name: Update env vars
run: |
docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
- name: GPU Unit Tests w docker image
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
- name: GPU Unit Tests monkeypatched w docker image
run: |
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
modal run cicd.tests

.gitignore

@@ -2,6 +2,7 @@
configs
last_run_prepared/
.vscode
_site/
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -167,3 +168,10 @@ cython_debug/
# WandB
# wandb creates a folder to store logs for training runs
wandb
# Runs
lora-out/*
qlora-out/*
mlruns/*
/.quarto/


@@ -1,5 +1,5 @@
[mypy]
plugins = pydantic.mypy
exclude = venv
[mypy-alpaca_lora_4bit.*]
@@ -32,6 +32,9 @@ ignore_missing_imports = True
[mypy-bitsandbytes]
ignore_missing_imports = True
[mypy-requests]
ignore_missing_imports = True
[mypy-datasets]
ignore_missing_imports = True


@@ -31,6 +31,7 @@ repos:
additional_dependencies:
[
'types-PyYAML',
'pydantic>=2.5.3',
]
- repo: https://github.com/PyCQA/bandit
rev: 1.7.5

README.md

@@ -13,6 +13,9 @@ Features:
- Log results and optionally checkpoints to wandb or mlflow
- And more!
<a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
<img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
</a>
<table>
<tr>
@@ -22,21 +25,26 @@ Features:
- [Introduction](#axolotl)
- [Supported Features](#axolotl-supports)
- [Quickstart](#quickstart-)
- [Installation](#installation)
- [Environment](#environment)
- [Docker](#docker)
- [Conda/Pip venv](#condapip-venv)
- [Cloud GPU](#cloud-gpu) - Runpod, Latitude
- [LambdaLabs](#lambdalabs)
- [Cloud GPU](#cloud-gpu) - Latitude.sh, JarvisLabs, RunPod
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
- [Windows](#windows)
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Dataset](#dataset)
- [How to Add Custom Prompts](#how-to-add-custom-prompts)
- [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
- [Config](#config)
- [Train](#train)
- [Inference](#inference)
- [Inference](#inference-playground)
- [Merge LORA to Base](#merge-lora-to-base)
- [Special Tokens](#special-tokens)
- Advanced Topics
- [Multipack](./docs/multipack.qmd)
- [RLHF & DPO](./docs/rlhf.qmd)
- [Common Errors](#common-errors-)
- [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
- [Debugging Axolotl](#debugging-axolotl)
@@ -84,17 +92,18 @@ Features:
| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| Gemma | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
✅: supported
❌: not supported
❓: untested
## Quickstart ⚡
Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
**Requirements**: Python >=3.9 and Pytorch >=2.0.
**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
`pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
### For developers
```bash
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
@@ -105,6 +114,9 @@ pip3 install -e '.[flash-attn,deepspeed]'
### Usage
```bash
# preprocess datasets - optional but recommended
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/openllama-3b/lora.yml
# finetune lora
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
@@ -115,15 +127,20 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
# gradio
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
--lora_model_dir="./lora-out" --gradio
# remote yaml files - the yaml config can be hosted on a public URL
# Note: the yaml config must directly link to the **raw** yaml
accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
```
## Installation
## Advanced Setup
### Environment
#### Docker
```bash
docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
```
Or run on the current files for development:
@@ -133,7 +150,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
```
>[!Tip]
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.md#debugging-with-docker).
> If you want to debug axolotl or prefer to use Docker as your development environment, see the [debugging guide's section on Docker](docs/debugging.qmd#debugging-with-docker).
<details>
@@ -142,7 +159,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
A more powerful Docker command to run would be this:
```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
```
It additionally:
@@ -157,7 +174,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
</details>
#### Conda/Pip venv
1. Install python >=**3.9**
1. Install python >=**3.10**
2. Install pytorch stable https://pytorch.org/get-started/locally/
@@ -176,9 +193,14 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)
- on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
- on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
- on RunPod use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
#### LambdaLabs
#### Bare Metal Cloud GPU
##### LambdaLabs
<details>
<summary>Click to Expand</summary>
@@ -186,11 +208,11 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
1. Install python
```bash
sudo apt update
sudo apt install -y python3.9
sudo apt install -y python3.10
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
sudo update-alternatives --config python # pick 3.9 if given option
python -V # should be 3.9
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1
sudo update-alternatives --config python # pick 3.10 if given option
python -V # should be 3.10
```
@@ -222,21 +244,50 @@ For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud
```
</details>
##### GCP
<details>
<summary>Click to Expand</summary>
Use a Deep Learning Linux OS image with CUDA and PyTorch pre-installed, then follow the Quickstart instructions.
Make sure to run the command below to uninstall XLA.
```bash
pip uninstall -y torch_xla[tpu]
```
</details>
#### Windows
Please use WSL or Docker!
#### Mac
Use the below instead of the install method in QuickStart.
```
pip3 install -e '.'
```
More info: [mac.md](/docs/mac.qmd)
#### Google Colab
Please use this example [notebook](examples/colab-notebooks/colab-axolotl-example.ipynb).
#### Launching on public clouds via SkyPilot
To launch on GPU instances (both on-demand and spot instances) on 7+ clouds (GCP, AWS, Azure, OCI, and more), you can use [SkyPilot](https://skypilot.readthedocs.io/en/latest/index.html):
```bash
pip install "skypilot-nightly[gcp,aws,azure,oci,lambda,kubernetes,ibm,scp]" # choose your clouds
sky check
```
Get the [example YAMLs](https://github.com/skypilot-org/skypilot/tree/master/llm/axolotl) of using Axolotl to finetune `mistralai/Mistral-7B-v0.1`:
```
git clone https://github.com/skypilot-org/skypilot.git
cd skypilot/llm/axolotl
```
Use one command to launch:
```bash
# On-demand
@@ -246,32 +297,33 @@ HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
```
### Dataset
Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
Have dataset(s) in one of the following format (JSONL recommended):
- `alpaca`: instruction; input(optional)
```json
{"instruction": "...", "input": "...", "output": "..."}
```
- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: `system` to override default system prompt)
```json
{"conversations": [{"from": "...", "value": "..."}]}
```
- `llama-2`: the json is the same format as `sharegpt` above, with the following config (see the [config section](#config) for more details)
```yml
datasets:
- path: <your-path>
type: sharegpt
conversation: llama-2
```
#### Pretraining
- `completion`: raw corpus
```json
{"text": "..."}
```
Note: Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:
```yaml
pretraining_dataset: # hf path only
```
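For example, a minimal streaming setup might look like the following sketch (the dataset path is illustrative, not taken from this changeset; `max_steps` is needed because a streamed dataset has no length):
```yaml
pretraining_dataset: togethercomputer/RedPajama-Data-1T-Sample  # illustrative HF hub path with a `text` column
max_steps: 10000  # bound training by steps since the streamed dataset has no length
```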
#### Supervised finetuning
##### Instruction
- `alpaca`: instruction; input(optional)
```json
{"instruction": "...", "input": "...", "output": "..."}
```
<details>
<summary>See other formats</summary>
@@ -348,14 +400,37 @@ Have dataset(s) in one of the following format (JSONL recommended):
```json
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
```
- `pygmalion`: pygmalion
```json
{"conversations": [{"role": "...", "value": "..."}]}
```
- `metharme`: instruction, adds additional eos tokens
```json
{"prompt": "...", "generation": "..."}
```
</details>
##### Template-Free
- `input_output`: template-free prompt construction
```json
{"segments": [{"label": true|false, "text": "..."}]}
```
This is a special format that allows you to construct prompts without using templates. This is for advanced users who want more freedom with prompt construction. See [these docs](docs/input_output.qmd) for more details.
##### Conversation
- `sharegpt`: conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
```json
{"conversations": [{"from": "...", "value": "..."}]}
```
<details>
<summary>See other formats</summary>
- `pygmalion`: pygmalion
```json
{"conversations": [{"role": "...", "value": "..."}]}
```
- `sharegpt.load_role`: conversations where `role` is used instead of `from`
```json
{"conversations": [{"role": "...", "value": "..."}]}
@@ -371,6 +446,8 @@ Have dataset(s) in one of the following format (JSONL recommended):
</details>
Note: `type: sharegpt` opens a special config `conversation:` that enables conversions to many Conversation types. See dataset section under [all yaml options](#all-yaml-options).
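As a concrete illustration, a hedged sketch of a ShareGPT dataset entry that selects a FastChat conversation template (the dataset path is a placeholder):
```yaml
datasets:
  - path: your-org/your-sharegpt-dataset  # placeholder
    type: sharegpt
    conversation: chatml  # any FastChat conversation 'name'; defaults to vicuna_v1.1
```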
#### How to add custom prompts
For a dataset that is preprocessed for instruction purposes:
@@ -392,12 +469,16 @@ datasets:
format: "[INST] {instruction} [/INST]"
no_input_format: "[INST] {instruction} [/INST]"
```
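Putting the pieces together, a complete `datasets` entry for a custom instruction format might look like this sketch (the repo path is a placeholder):
```yaml
datasets:
  - path: your-org/your-instruct-dataset  # placeholder
    type:
      system_prompt: ""
      field_system: system
      format: "[INST] {instruction} [/INST]"
      no_input_format: "[INST] {instruction} [/INST]"
```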
See full config options under [all yaml options](#all-yaml-options).
#### How to use your custom pretokenized dataset
- Do not pass a `type:`
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
```yaml
- path: ...
```
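A slightly fuller sketch that satisfies both rules above (the path is a placeholder):
```yaml
datasets:
  - path: your-org/your-pretokenized-dataset  # placeholder; rows must contain input_ids, attention_mask, labels
    # note: no `type:` is set for pretokenized data
```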
### Config
@@ -411,22 +492,18 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
- dataset
```yaml
sequence_len: 2048 # max token length for prompt
# huggingface repo
datasets:
# huggingface repo
- path: vicgalle/alpaca-gpt4
type: alpaca # format from earlier
type: alpaca
# huggingface repo with specific configuration/subset
datasets:
# huggingface repo with specific configuration/subset
- path: EleutherAI/pile
name: enron_emails
type: completion # format from earlier
field: text # Optional[str] default: text, field to use for completion data
# huggingface repo with multiple named configurations/subsets
datasets:
# huggingface repo with multiple named configurations/subsets
- path: bigcode/commitpackft
name:
- ruby
@@ -434,39 +511,42 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
- typescript
type: ... # unimplemented custom format
# fastchat conversation
# See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
datasets:
# fastchat conversation
# See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
- path: ...
type: sharegpt
conversation: chatml
conversation: chatml # default: vicuna_v1.1
# local
datasets:
# local
- path: data.jsonl # or json
ds_type: json # see other options below
type: alpaca
# dataset with splits, but no train split
dataset:
# dataset with splits, but no train split
- path: knowrohit07/know_sql
type: context_qa.load_v2
train_on_split: validation
# loading from s3 or gcs
# s3 creds will be loaded from the system default and gcs only supports public access
dataset:
# loading from s3 or gcs
# s3 creds will be loaded from the system default and gcs only supports public access
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
...
# Loading Data From a Public URL
# - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
- path: https://some.url.com/yourdata.jsonl # The URL should be a direct link to the file you wish to load. URLs must use HTTPS protocol, not HTTP.
ds_type: json # this is the default, see other options below.
```
- loading
```yaml
load_in_4bit: true
load_in_8bit: true
bf16: true # require >=ampere
fp16: true
bf16: auto # require >=ampere, auto will detect if your GPU supports this and choose automatically.
fp16: # leave empty to use fp16 when bf16 is 'auto'. set to false if you want to fallback to fp32
tf32: true # require >=ampere
bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
float16: true # use instead of fp16 when you don't want AMP
```
@@ -474,7 +554,7 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
- lora
```yaml
adapter: lora # qlora or leave blank for full finetune
adapter: lora # 'qlora' or leave blank for full finetune
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
@@ -483,9 +563,9 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
- v_proj
```
<details>
<details id="all-yaml-options">
<summary>All yaml options (click me)</summary>
<summary>All yaml options (click to expand)</summary>
```yaml
# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
@@ -497,8 +577,8 @@ base_model_ignore_patterns:
# You can set that here, or leave this empty to default to base_model
base_model_config: ./llama-7b-hf
# You can specify to choose a specific model revision from huggingface hub
model_revision:
# Optional tokenizer configuration override in case you want to use a different tokenizer
revision_of_model:
# Optional tokenizer configuration path in case you want to use a different tokenizer
# than the one defined in the base model
tokenizer_config:
# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
@@ -515,15 +595,16 @@ tokenizer_legacy:
# This is reported to improve training speed on some models
resize_token_embeddings_to_32x:
# (Internal use only)
# Used to identify what the model is based on
is_falcon_derived_model:
is_llama_derived_model:
is_qwen_derived_model:
# Please note that if you set this to true, `padding_side` will be set to "left" by default
is_mistral_derived_model:
is_qwen_derived_model:
# optional overrides to the base model configuration
model_config:
overrides_of_model_config:
# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
rope_scaling:
type: # linear | dynamic
@@ -540,8 +621,6 @@ bnb_config_kwargs:
# Whether you are training a 4-bit GPTQ quantized model
gptq: true
gptq_groupsize: 128 # group size
gptq_model_v1: false # v1 or v2
# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: true
@@ -577,9 +656,13 @@ datasets:
train_on_split: train # Optional[str] name of dataset split to load from
# Optional[str] fastchat conversation type, only used with type: sharegpt
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
field_human: # Optional[str]. Human key to use for conversation.
field_model: # Optional[str]. Assistant key to use for conversation.
# Add additional keys from your dataset as input or output roles
roles:
input: # Optional[List[str]]. These will be masked based on train_on_input
output: # Optional[List[str]].
# Custom user instruction prompt
- path: repo
@@ -604,12 +687,29 @@ datasets:
# For `completion` datasets only, uses the provided field instead of the `text` column
field:
# use RL training: dpo, ipo, kto_pair
# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true
# A list of one or more datasets to eval the model with.
# You can use either test_datasets, or val_set_size, but not both.
test_datasets:
- path: /workspace/data/eval.jsonl
ds_type: json
# You need to specify a split. For "json" datasets the default split is called "train".
split: train
type: completion
data_files:
- /workspace/data/eval.jsonl
# use RL training: 'dpo', 'ipo', 'kto_pair'
rl:
# Saves the desired chat template to the tokenizer_config.json for easier inferencing
# Currently supports chatml and inst (mistral/mixtral)
chat_template: chatml
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared
@@ -618,8 +718,11 @@ push_dataset_to_hub: # repo path
# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# if not set.
dataset_processes: # defaults to os.cpu_count() if not set
# Keep dataset in memory while preprocessing
# Only needed if cached dataset is taking too much storage
dataset_keep_in_memory:
# push checkpoints to hub
hub_model_id: # repo path to push finetuned model
hub_model_id: # private repo path to push finetuned model
# how to push checkpoints to hub
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
hub_strategy:
@@ -639,10 +742,6 @@ sequence_len: 2048
# Pad inputs so each step uses constant sized buffers
# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
pad_to_sequence_len:
# Max sequence length to concatenate training samples together up to
# Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# FutureWarning: This will soon be DEPRECATED
max_packed_sequence_len: 1024
# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
sample_packing:
# Set to 'false' if getting errors during eval with sample_packing on.
@@ -692,10 +791,18 @@ lora_modules_to_save:
lora_fan_in_fan_out: false
peft:
# Configuration options for loftq initialization for LoRA
# https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
loftq_config:
loftq_bits: # typically 4 bits
# ReLoRA configuration
# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
relora_steps: # Number of steps per ReLoRA restart
relora_warmup_steps: # Number of per-restart warmup steps
relora_anneal_steps: # Number of anneal steps for each relora cycle
relora_prune_ratio: # threshold for optimizer magnitude when pruning
relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
# wandb configuration if you're using it
@@ -711,6 +818,7 @@ wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_step
# mlflow configuration if you're using it
mlflow_tracking_uri: # URI to mlflow
mlflow_experiment_name: # Your experiment name
hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry
# Where to save the full-finetuned model to
output_dir: ./completed-model
@@ -744,7 +852,8 @@ save_total_limit: # Checkpoints saved at a time
max_steps:
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf"]
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
@@ -763,7 +872,7 @@ group_by_length: false
gradient_checkpointing: false
# additional kwargs to pass to the trainer for gradient checkpointing
# gradient_checkpointing_kwargs:
# use_reentrant: false
# use_reentrant: true
# Stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
@@ -773,14 +882,11 @@ early_stopping_patience: 3
lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
# For one_cycle optim
lr_div_factor: # Learning rate div factor
# For log_sweep optim
log_sweep_min_lr:
log_sweep_max_lr:
# Specify optimizer
# Valid values are driven by the Transformers OptimizerNames class, see:
# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
@@ -806,7 +912,26 @@ log_sweep_max_lr:
# - paged_adamw_8bit
# - paged_lion_32bit
# - paged_lion_8bit
# - galore_adamw
# - galore_adamw_8bit
# - galore_adafactor
# - galore_adamw_layerwise
# - galore_adamw_8bit_layerwise
# - galore_adafactor_layerwise
optimizer:
# Dictionary of arguments to pass to the optimizer
optim_args:
# For Galore Optimizers the following optim_args are available
# rank: # type: int
# update_proj_gap # type: int
# scale # type: float
# proj_type: # type: str, default = std
# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
optim_target_modules:
# - self_attn # for llama
# - mlp
# Specify weight decay
weight_decay:
# adamw hyperparams
@@ -834,7 +959,8 @@ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# Whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention:
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
s2_attention:
# Resume from a specific checkpoint dir
resume_from_checkpoint:
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
@@ -858,7 +984,7 @@ tokens:
fsdp:
fsdp_config:
# Deepspeed config path. e.g., deepspeed/zero3.json
# Deepspeed config path. e.g., deepspeed_configs/zero3.json
deepspeed:
# Advanced DDP Arguments
@@ -951,6 +1077,9 @@ Run
accelerate launch -m axolotl.cli.train your_config.yml
```
> [!TIP]
> You can also reference a config file that is hosted on a public URL, for example `accelerate launch -m axolotl.cli.train https://yourdomain.com/your_config.yml`
#### Preprocess dataset
You can optionally pre-tokenize dataset with the following before finetuning.
@@ -979,11 +1108,11 @@ for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usa
We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3.
```yaml
deepspeed: deepspeed/zero1.json
deepspeed: deepspeed_configs/zero1.json
```
```shell
accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed_configs/zero1.json
```
##### FSDP
@@ -999,6 +1128,10 @@ fsdp_config:
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```
##### FSDP + QLoRA
Axolotl supports training with FSDP and QLoRA, see [these docs](docs/fsdp_qlora.qmd) for more information.
##### Weights & Biases Logging
Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
@@ -1060,7 +1193,7 @@ Please use `--sample_packing False` if you have it on and receive the error simi
### Merge LORA to base
The following command will merge your LORA adapter with your base model. You can optionally pass the argument `--lora_model_dir` to specify the directory where your LORA adapter was saved, otherwise this will be inferred from `output_dir` in your axolotl config file. The merged model is saved in the sub-directory `{lora_model_dir}/merged`.
```bash
python3 -m axolotl.cli.merge_lora your_config.yml --lora_model_dir="./completed-model"
@@ -1076,7 +1209,7 @@ although this will be very slow, and using the config options above are recommen
## Common Errors 🧰
See also the [FAQ's](./docs/faq.md) and [debugging guide](docs/debugging.md).
See also the [FAQ's](./docs/faq.qmd) and [debugging guide](docs/debugging.qmd).
> If you encounter a 'Cuda out of memory' error, it means your GPU ran out of memory during the training process. Here's how to resolve it:
@@ -1110,7 +1243,7 @@ It's safe to ignore it.
> NCCL Timeouts during training
See the [NCCL](docs/nccl.md) guide.
See the [NCCL](docs/nccl.qmd) guide.
### Tokenization Mismatch b/w Inference & Training
@@ -1121,18 +1254,20 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke
1. Materialize some data using `python -m axolotl.cli.preprocess your_config.yml --debug`, and then decode the first few rows with your model's tokenizer.
2. During inference, right before you pass a tensor of token ids to your model, decode these tokens back into a string.
3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same adjust your inference server accordingly.
4. As an additional troubleshooting step, you can look look at the token ids between 1 and 2 to make sure they are identical.
3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same, adjust your inference server accordingly.
4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.
Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.
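To make steps 1 and 2 concrete, here is a small sketch of the comparison (the model id and token ids are placeholders, not from this changeset):
```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("your-base-model")  # placeholder model id

# step 1: token ids copied from a row of the materialized (preprocessed) dataset
train_ids = [1, 22557, 13]  # placeholder ids

# step 2: token ids your inference server is about to feed the model
infer_ids = tok("your inference prompt here")["input_ids"]

# steps 3/4: the decoded strings and the raw ids should match exactly
print(repr(tok.decode(train_ids)))
print(repr(tok.decode(infer_ids)))
print(train_ids == infer_ids)
```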
## Debugging Axolotl
See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.
## Need help? 🙋♂️
## Need help? 🙋
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
Join our [Discord server](https://discord.gg/HhrNrHJPRb) where our community members can help you.
Need dedicated support? Please contact us at [✉wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org) to discuss available options.
## Badge ❤🏷️
@@ -1149,7 +1284,7 @@ Building something cool with Axolotl? Consider adding a badge to your model card
Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.
Open Access AI Collective
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b-fixed)
- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
@@ -1166,13 +1301,28 @@ PRs are **greatly welcome**!
Please run below to setup env
```bash
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install
# test
pytest tests/
# optional: run against all files
pre-commit run --all-files
```
Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
<a href="https://github.com/openaccess-ai-collective/axolotl/graphs/contributors">
<img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
</a>
## Sponsors 🤝❤
OpenAccess AI Collective is run by volunteer contributors such as [winglian](https://github.com/winglian),
@@ -1201,4 +1351,6 @@ consider sponsoring the project via [GitHub Sponsors](https://github.com/sponsor
#### 🥉 Bronze Sponsors - $500/mo
- [JarvisLabs.ai](https://jarvislabs.ai)
---

_quarto.yml
@@ -0,0 +1,51 @@
project:
type: website
website:
title: "Axolotl"
description: "Fine-tuning"
favicon: favicon.jpg
navbar:
title: Axolotl
background: dark
pinned: false
collapse: false
tools:
- icon: twitter
href: https://twitter.com/axolotl_ai
- icon: github
href: https://github.com/OpenAccess-AI-Collective/axolotl/
- icon: discord
href: https://discord.gg/7m9sfhzaf3
sidebar:
pinned: true
collapse-level: 2
style: docked
contents:
- text: Home
href: index.qmd
- section: "How-To Guides"
contents:
# TODO Edit folder structure after we have more docs.
- docs/debugging.qmd
- docs/multipack.qmd
- docs/fdsp_qlora.qmd
- docs/input_output.qmd
- docs/rlhf.qmd
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- section: "Reference"
contents:
- docs/config.qmd
- docs/faq.qmd
format:
html:
theme: materia
css: styles.css
toc: true

cicd/Dockerfile.jinja
@@ -0,0 +1,40 @@
FROM winglian/axolotl-base:{{ BASE_TAG }}
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV BNB_CUDA_VERSION="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
WORKDIR /workspace
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
WORKDIR /workspace/axolotl
RUN git fetch origin +$GITHUB_REF && \
git checkout FETCH_HEAD
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image
RUN pip install pytest
# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
git config --get remote.origin.fetch
# helper for huggingface-login cli
RUN git config --global credential.helper store

cicd/cicd.sh
@@ -0,0 +1,5 @@
#!/bin/bash
pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
pytest /workspace/axolotl/tests/e2e/patched/
pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/

cicd/tests.py
@@ -0,0 +1,75 @@
"""
modal application to run axolotl gpu tests in Modal
"""
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import Image, Stub
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
"CUDA": os.environ.get("CUDA", "118"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = (
Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
force_build=True,
gpu="A10G",
)
.env(df_args)
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
)
stub = Stub("Axolotl CI/CD", secrets=[])
N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit
@stub.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=45 * 60,
cpu=8.0,
memory=131072,
)
def cicd_pytest():
run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
@stub.local_entrypoint()
def main():
cicd_pytest.remote()

@@ -15,16 +15,8 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false

@@ -19,16 +19,8 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false

@@ -23,16 +23,8 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false

@@ -23,16 +23,8 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false

@@ -1 +1 @@
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.md](../docs/debugging.md) for more information.
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.

@@ -2,7 +2,6 @@
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false

@@ -3,9 +3,10 @@ FROM winglian/axolotl-base:$BASE_TAG
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
ARG CUDA="118"
ENV BNB_CUDA_VERSION=$CUDA
ARG PYTORCH_VERSION="2.0.1"
ARG PYTORCH_VERSION="2.1.2"
ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -19,10 +20,11 @@ RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

@@ -7,8 +7,8 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a
ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PYTHON_VERSION="3.9"
ARG PYTORCH_VERSION="2.0.1"
ARG PYTHON_VERSION="3.10"
ARG PYTORCH_VERSION="2.1.2"
ARG CUDA="118"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
RUN git lfs install --skip-repo && \
pip3 install awscli && \

@@ -7,14 +7,19 @@ ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
EXPOSE 8888
EXPOSE 22
RUN pip install jupyterlab notebook && \
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd
RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean
RUN apt install --yes --no-install-recommends openssh-server tmux && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
chmod +x /root/cloud-entrypoint.sh

@@ -3,9 +3,10 @@ FROM winglian/axolotl-base:$BASE_TAG
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
ARG CUDA="118"
ENV BNB_CUDA_VERSION=$CUDA
ARG PYTORCH_VERSION="2.0.1"
ARG PYTORCH_VERSION="2.1.2"
ARG GITHUB_REF="main"
ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -24,9 +25,9 @@ RUN git fetch origin +$GITHUB_REF && \
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

docs/.gitignore
@@ -0,0 +1,2 @@
/.quarto/
_site/

docs/config.qmd
@@ -0,0 +1,17 @@
---
title: Config options
description: A complete list of all configuration options.
---
```{python}
#|echo: false
#|output: asis
import re
# Regex pattern to match the YAML block including its code fence
pattern = r'<details[^>]*id="all-yaml-options"[^>]*>.*?<summary>All yaml options.*?```yaml(.*?)```.*?</details>'
with open('../README.md', 'r') as f:
doc = f.read()
match = re.search(pattern, doc, re.DOTALL)
print("```yaml", match.group(1).strip(), "```", sep="\n")
```

@@ -1,4 +1,8 @@
# Debugging Axolotl
---
title: Debugging
description: How to debug Axolotl
---
This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.
@@ -74,7 +78,6 @@ pip3 install -e '.[flash-attn,deepspeed]'
If you are developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this [remote - SSH guide](https://code.visualstudio.com/docs/remote/ssh). You can also see the video below on [Docker and Remote SSH debugging](#video---attaching-to-docker-on-remote-host).
```bash
### Configuration

@@ -1,18 +0,0 @@
# Axolotl FAQ's
> The trainer stopped and hasn't progressed in several minutes.
Usually an issue with the GPU's communicating with each other. See the [NCCL doc](../docs/nccl.md)
> Exitcode -9
This usually happens when you run out of system RAM.
> Exitcode -7 while using deepspeed
Try upgrading deepspeed w: `pip install -U deepspeed`
> AttributeError: 'DummyOptim' object has no attribute 'step'
You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.

docs/faq.qmd
@@ -0,0 +1,21 @@
---
title: FAQ
description: Frequently asked questions
---
**Q: The trainer stopped and hasn't progressed in several minutes.**
> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)
**Q: Exitcode -9**
> A: This usually happens when you run out of system RAM.
**Q: Exitcode -7 while using deepspeed**
> A: Try upgrading deepspeed w: `pip install -U deepspeed`
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.

docs/fsdp_qlora.qmd
@@ -0,0 +1,43 @@
---
title: FSDP + QLoRA
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
html:
toc: true
---
## Background
Using FSDP with QLoRA is essential for **fine-tuning larger (70b+ parameter) LLMs on consumer GPUs.** For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs[^1].
Below, we describe how to use this feature in Axolotl.
## Usage
To enable `QLoRA` with `FSDP`, you need to perform the following steps:
> [!Tip]
> See the [example config](#example-config) file in addition to reading these instructions.
1. Set `adapter: qlora` in your axolotl config file.
2. Enable FSDP in your axolotl config, as [described here](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#fsdp).
3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.
## Example Config
[examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl.
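For orientation, the key pieces of such a config look roughly like the sketch below (values are illustrative, not copied from the example file):
```yaml
base_model: NousResearch/Llama-2-7b-hf  # illustrative
adapter: qlora
load_in_4bit: true
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```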
## References
- [PR #1378](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP.
- Related HuggingFace PRs enabling FSDP + QLoRA:
- Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544)
- Transformers [PR#29587](https://github.com/huggingface/transformers/pull/29587)
- TRL [PR#1416](https://github.com/huggingface/trl/pull/1416)
- PEFT [PR#1550](https://github.com/huggingface/peft/pull/1550)
[^1]: This was enabled by [this work](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the Answer.AI team.

docs/images/4d-mask.png (binary image added, 239 KiB, not shown)
docs/input_output.qmd
@@ -0,0 +1,263 @@
---
title: Template-free prompt construction
description: "Template-free prompt construction with the `input_output` format"
---
<!-- TOC -->
- [Background](#background)
- [Masking Inputs](#masking-inputs)
- [You may not want prompt templates](#you-may-not-want-prompt-templates)
- [The `input_output` format](#the-input_output-format)
- [Usage](#usage)
- [1. Prepare Data](#1-prepare-data)
- [2. Use `type: input_output`](#2-use-type-input_output)
- [3. Check the prompts](#3-check-the-prompts)
<!-- /TOC -->
<a id="markdown-background" name="background"></a>
## Background
<a id="markdown-masking-inputs" name="masking-inputs"></a>
### Masking Inputs
One of the most popular features of
[axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) is
setting the following configuration value:
```yaml
train_on_inputs: false
```
If you declare a [dataset formats](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#dataset)
such as `alpaca` or `chatml`, axolotl knows what is an input
(i.e. human) vs. an output (i.e. the assistant) and masks the input
labels so that your model can focus on predicting the outputs only.
<a id="markdown-you-may-not-want-prompt-templates" name="you-may-not-want-prompt-templates"></a>
### You may not want prompt templates
However, there are many situations where you don't want to use one of
these formats or templates (I usually don't!). This is because they can:
- Add unnecessary boilerplate to your prompts.
- Create artifacts like special delimiters `<|im_start|>` that can
quickly become footguns if you don't include them correctly at
inference time.
- Enforce a *chat* interface when you do not want one. Sometimes you
just want to fine-tune a model to a very specific task and do NOT
want multi-turn conversations, roles, etc.
- Limit you to only certain roles that the template allows.
<a id="markdown-the-inputoutput-format" name="the-inputoutput-format"></a>
### The `input_output` format
You can construct your prompts without a template by using the
`input_output` format, by setting `type: input_output` in your
configuration file like this:
**config.yml**
```yaml
train_on_inputs: false # Mask segments of your data
datasets:
- path: output.jsonl
type: input_output # use template free prompt construction
```
Unlike `type: completion`, which is also template-free,
`type: input_output` allows you to mask segments of your text. More
details on how this works are described below.
<a id="markdown-usage" name="usage"></a>
## Usage
This is how you can use the `input_output` format:
<a id="markdown-1-prepare-data" name="1-prepare-data"></a>
### 1. Prepare Data
To use the `input_output` format, collect your data in the following
format into a jsonl file (below is the first row from the file
`output.jsonl` pretty printed):
```bash
$ head -n1 output.jsonl | python -m json.tool
{.cell-output .cell-output-stdout}
{
"segments": [
{
"label": true,
"text": "<s>Hello\n"
},
{
"label": true,
"text": "hi there!. "
},
{
"label": false,
"text": "goodbye "
},
{
"label": true,
"text": "farewell</s>"
}
]
}
```
Set `label:false` when you want to mask a segment of text so that the
model isn't trained on it. Some things to keep in mind:
> [!IMPORTANT]
> 1. **EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl
concatenates all the segments as-is.** The tokenizer doesn't add
anything additional. Notice how I added spaces, newlines, `<s>`
(BOS), and `</s>` (EOS) myself.
> 2. Make sure you check the materialized output to validate that the
prompt is getting assembled how you like.
<a id="markdown-2-use-type-inputoutput" name="2-use-type-inputoutput"></a>
### 2. Use `type: input_output`
Let's materialize data with our `output.jsonl` file by setting
`type: input_output` in our axolotl config:
```yaml
# training_config.yaml
base_model: mistralai/Mistral-7B-v0.1
data_seed: 49
seed: 49
datasets:
- path: output.jsonl
type: input_output
val_set_size: 0.1
sequence_len: 896
sample_packing: false
micro_batch_size: 2
gradient_accumulation_steps: 3
eval_batch_size: 2
num_epochs: 1
learning_rate: 0.0002
train_on_inputs: false
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"
```
You can use the following command to materialize your data. The
`--debug` flag will print the tokens, along with the labels so you can
verify that the correct items are being ignored:
```bash
$ python -m axolotl.cli.preprocess training_config.yaml --debug
...
[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557)
(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2)
```
The format is `decoded_token`(`label`, `token_id`), for example,
`<s>(1, 1)` means that the token is `<s>`, the label is `1` and the
token_id is `1`. When the label is `-100` then that token is ignored for
training.
<a id="markdown-3-check-the-prompts" name="3-check-the-prompts"></a>
### 3. Check the prompts
Here is another way to check the materialized output:
```python
from transformers import AutoTokenizer
from datasets import load_from_disk
import yaml
directory = !ls last_run_prepared/
with open('training_config.yaml', 'r') as f:
cfg = yaml.safe_load(f)
model_id = cfg['base_model']
tok = AutoTokenizer.from_pretrained(model_id)
ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
```
```python
>>> row = ds[0]
>>> print(tok.decode(row['input_ids']))
<s> Hello
hi there!. goodbye farewell</s>
```
We can check that the right tokens are ignored by comparing the labels
to each token:
```python
import pandas as pd
pd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in
zip(row['input_ids'], row['labels'])])
```
|    | token  | label | id    |
|----|--------|-------|-------|
| 0  | \<s\>  | 1     | 1     |
| 1  | Hello  | 22557 | 22557 |
| 2  | \\n    | 13    | 13    |
| 3  | hi     | 12014 | 12014 |
| 4  | there  | 736   | 736   |
| 5  | !      | 28808 | 28808 |
| 6  | .      | 28723 | 28723 |
| 7  |        | 28705 | 28705 |
| 8  | good   | -100  | 1179  |
| 9  | bye    | -100  | 17664 |
| 10 |        | -100  | 28705 |
| 11 | fare   | 19111 | 19111 |
| 12 | well   | 5458  | 5458  |
| 13 | \</s\> | 2     | 2     |
If we look at the input data, the above table seems correct! (The jsonl
version is repeated below for reference):
```bash
$ head -n1 output.jsonl | python -m json.tool
{.cell-output .cell-output-stdout}
{
"segments": [
{
"label": true,
"text": "<s>Hello\n"
},
{
"label": true,
"text": "hi there!. "
},
{
"label": false,
"text": "goodbye "
},
{
"label": true,
"text": "farewell</s>"
}
]
}
```

docs/mac.qmd
@@ -0,0 +1,22 @@
---
title: Mac M-series
description: Mac M-series support
---
Currently, Axolotl on Mac is only partially usable: many of Axolotl's dependencies, including PyTorch, do not support MPS or have incomplete support.
Current support:
- [x] Support for all models
- [x] Full training of models
- [x] LoRA training
- [x] Sample packing
- [ ] FP16 and BF16 (awaiting AMP support for MPS in Pytorch)
- [ ] Tri-dao's flash-attn (until it is supported, use `sdp_attention` as an alternative; see the sketch below)
- [ ] xformers
- [ ] bitsandbytes (meaning no 4/8 bits loading and bnb optimizers)
- [ ] qlora
- [ ] DeepSpeed
Untested:
- FSDP
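Since flash-attn wheels are unavailable for MPS, a config can fall back to PyTorch's scaled-dot-product attention; a minimal sketch of the relevant lines:
```yaml
flash_attention: false  # flash-attn is not available on MPS
sdp_attention: true     # use PyTorch scaled-dot-product attention instead
```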

@@ -1,4 +1,7 @@
# Multi Node
---
title: Multi Node
description: How to use Axolotl on multiple machines
---
You will need to create a configuration for accelerate, either by using `accelerate config` and following the instructions, or by using one of the presets below:

@@ -1,51 +0,0 @@
# Multipack
4k context, bsz =4,
each character represents 256 tokens
X represents a padding token
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
B B B B B B ]
C C C C C C C ]
D D D D ]]
[[ E E E E E E E E ]
[ F F F F ]
[ G G G ]
[ H H H H ]]
[[ I I I ]
[ J J J ]
[ K K K K K]
[ L L L ]]
```
after padding to longest input in each step
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
B B B B B B X X X X X X ]
C C C C C C C X X X X ]
D D D D X X X X X X X ]]
[[ E E E E E E E E ]
[ F F F F X X X X ]
[ G G G X X X X X ]
[ H H H H X X X X ]]
[[ I I I X X ]
[ J J J X X ]
[ K K K K K ]
[ L L L X X ]]
```
w packing ( note it's the same effective number of tokens per step, but a true bsz of 1)
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A B B B B B
B C C C C C C C D D D D E E E E
E E E E F F F F F G G G H H H H
I I I J J J J K K K K K L L L X ]]
```

docs/multipack.qmd
@@ -0,0 +1,76 @@
---
title: Multipack (Sample Packing)
description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
---
## Visualization of Multipack with Flash Attention
Because Flash Attention simply drops the attention mask, we do not need to
construct a 4d attention mask. We only need to concatenate the sequences into
a single batch and let flash attention know where each new sequence begins.
4k context, bsz =4,
each character represents 256 tokens
X represents a padding token
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
B B B B B B ]
C C C C C C C ]
D D D D ]]
[[ E E E E E E E E ]
[ F F F F ]
[ G G G ]
[ H H H H ]]
[[ I I I ]
[ J J J ]
[ K K K K K]
[ L L L ]]
```
after padding to longest input in each step
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
B B B B B B X X X X X X ]
C C C C C C C X X X X ]
D D D D X X X X X X X ]]
[[ E E E E E E E E ]
[ F F F F X X X X ]
[ G G G X X X X X ]
[ H H H H X X X X ]]
[[ I I I X X ]
[ J J J X X ]
[ K K K K K ]
[ L L L X X ]]
```
w packing ( note it's the same effective number of tokens per step, but a true bsz of 1)
```
0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A B B B B B
B C C C C C C C D D D D E E E E
E E E E F F F F F G G G H H H H
I I I J J J J K K K K K L L L X ]]
```
cu_seqlens:
[[ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64]]
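`cu_seqlens` is just the cumulative sum of the packed sequence lengths with a leading zero; a small sketch of how it can be derived (lengths read off the packed example above, with the final padding token counted in the last entry):
```python
import torch

# per-sequence lengths (in 256-token units) for the packed example above
seq_lens = torch.tensor([11, 6, 7, 4, 8, 5, 3, 4, 3, 4, 5, 4])

# flash-attn's varlen kernels expect cumulative offsets with a leading 0
cu_seqlens = torch.nn.functional.pad(torch.cumsum(seq_lens, dim=0), (1, 0))
print(cu_seqlens)  # tensor([ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64])
```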
## Multipack without Flash Attention
Multipack can still be achieved without Flash Attention, but with lower packing
efficiency, since without Flash Attention we cannot join multiple batches into a
single batch due to context length limits. We can use either PyTorch's Scaled
Dot Product Attention implementation or the native PyTorch attention implementation
along with [4d attention masks](https://github.com/huggingface/transformers/pull/27539)
to pack sequences together and avoid cross attention.
<img src="./images/4d-mask.png" alt="axolotl" width="800">

@@ -1,4 +1,7 @@
# NCCL
---
title: NCCL
description: Troubleshooting NCCL issues
---
NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:
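When first diagnosing such timeouts, a common starting point (a general NCCL debugging tip, not specific to this changeset) is to enable NCCL's own debug logging before launching training:
```bash
# print NCCL's initialization and communication logs
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL  # optional and very verbose; narrow it down once you see the failure
accelerate launch -m axolotl.cli.train your_config.yml
```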

@@ -1,4 +1,7 @@
# RLHF (Beta)
---
title: "RLHF (Beta)"
description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
---
### Overview
@@ -12,21 +15,21 @@ feedback. Various methods include, but not limited to:
### RLHF using Axolotl
[!IMPORTANT]
This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.
>[!IMPORTANT]
>This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.
The various RL training methods are implemented in trl and wrapped via axolotl. Below are examples of how you can use different preference datasets to train models that use ChatML.
#### DPO
```yaml
rl: true
rl: dpo
datasets:
- path: Intel/orca_dpo_pairs
split: train
type: intel_apply_chatml
type: chatml.intel
- path: argilla/ultrafeedback-binarized-preferences
split: train
type: argilla_apply_chatml
type: chatml.argilla
```
#### IPO
@@ -34,6 +37,31 @@ datasets:
rl: ipo
```
#### ORPO
Paper: https://arxiv.org/abs/2403.07691
```yaml
rl: orpo
orpo_alpha: 0.1
remove_unused_columns: false
chat_template: chatml
datasets:
- path: argilla/ultrafeedback-binarized-preferences-cleaned
type: orpo.chat_template
```
#### Using local dataset files
```yaml
datasets:
- ds_type: json
data_files:
- orca_rlhf.jsonl
split: train
type: chatml.intel
```
#### Trl autounwrap for peft
Trl supports auto-unwrapping peft models, so that a separate reference model does not need to be loaded, reducing VRAM usage. This is on by default. To turn it off, pass the following config.

@@ -53,8 +53,8 @@ lr_quadratic_warmup: true
learning_rate: 0.000085
train_on_inputs: true
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: false

View File

@@ -11,7 +11,6 @@ val_set_size: 0.05
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len: 2048
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
@@ -36,8 +35,8 @@ lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-13b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false
@@ -41,8 +40,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -52,6 +51,7 @@ local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-13b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-34b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false
@@ -41,8 +40,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -52,6 +51,7 @@ local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-34b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false
@@ -41,8 +40,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -52,6 +51,7 @@ local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -1,7 +1,6 @@
base_model: codellama/CodeLlama-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -0,0 +1,216 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [
"# Example notebook for running Axolotl on google colab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [],
"source": [
"import torch\n",
"# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n",
"assert (torch.cuda.is_available()==True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [],
"source": [
"!pip install torch==\"2.1.2\"\n",
"!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BW2MFr7HTjub"
},
"source": [
"## Create an yaml config file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9pkF2dSoQEUN"
},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"is_llama_derived_model: true\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 1096\n",
"sample_packing: true\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
"wandb_watch:\n",
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"mlflow_experiment_name: colab-example\n",
"\n",
"gradient_accumulation_steps: 1\n",
"micro_batch_size: 1\n",
"num_epochs: 4\n",
"max_steps: 20\n",
"optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
"bf16: false\n",
"fp16: true\n",
"tf32: false\n",
"\n",
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: false\n",
"\n",
"warmup_steps: 10\n",
"evals_per_epoch:\n",
"saves_per_epoch:\n",
"debug:\n",
"deepspeed:\n",
"weight_decay: 0.0\n",
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
"\n",
"\"\"\"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
"\n",
"# Specify your file path\n",
"file_path = 'test_axolotl.yaml'\n",
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bidoj8YLTusD"
},
"source": [
"## Launch the training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"outputs": [],
"source": [
"# Buy using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Play with inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Buy using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --qlora_model_dir=\"./qlora-out\" --gradio"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -2,7 +2,7 @@ base_model: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_falcon_derived_model: true
load_in_8bit: true
load_in_4bit: false
gptq: false
@@ -38,8 +38,8 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
@@ -60,5 +60,5 @@ fsdp:
fsdp_config:
special_tokens:
pad_token: "<|endoftext|>"
bos_token: ">>ABSTRACT<<"
bos_token: "<|endoftext|>"
eos_token: "<|endoftext|>"

View File

@@ -5,7 +5,7 @@ base_model: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_falcon_derived_model: true
load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
@@ -64,8 +64,8 @@ lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
@@ -89,5 +89,5 @@ fsdp:
fsdp_config:
special_tokens:
pad_token: "<|endoftext|>"
bos_token: ">>ABSTRACT<<"
bos_token: "<|endoftext|>"
eos_token: "<|endoftext|>"

View File

@@ -2,7 +2,7 @@ base_model: tiiuae/falcon-7b
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_falcon_derived_model: true
load_in_8bit: false
load_in_4bit: false
gptq: false
@@ -38,8 +38,8 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
@@ -60,5 +60,5 @@ fsdp:
fsdp_config:
special_tokens:
pad_token: "<|endoftext|>"
bos_token: ">>ABSTRACT<<"
bos_token: "<|endoftext|>"
eos_token: "<|endoftext|>"

examples/gemma/qlora.yml Normal file
View File

@@ -0,0 +1,66 @@
# use google/gemma-7b if you have access
base_model: mhenrichsen/gemma-7b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
# huggingface repo
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
val_set_size: 0.1
output_dir: ./out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 3
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -33,8 +33,8 @@ lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:

examples/jamba/README.md Normal file
View File

@@ -0,0 +1,10 @@
# Jamba
- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
- 35GiB VRAM per GPU w minimal context length
- 56GiB VRAM per GPU (w multipack enabled)
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
- ✅ qlora single-gpu, ~51GiB VRAM
- ✅ multipack
- ❓ FSDP
- ❓ 8-bit LoRA

examples/jamba/qlora.yaml Normal file
View File

@@ -0,0 +1,62 @@
base_model: ai21labs/Jamba-v0.1
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: false
eval_sample_packing: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
adapter: qlora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
low_cpu_mem_usage: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
special_tokens:

View File

@@ -0,0 +1,62 @@
base_model: ai21labs/Jamba-v0.1
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: false
eval_sample_packing: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
adapter: qlora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
low_cpu_mem_usage: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch:
saves_per_epoch: 1
debug:
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
special_tokens:

View File

@@ -31,7 +31,7 @@ lr_scheduler: cosine
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:

View File

@@ -1,7 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: false
@@ -41,8 +40,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -62,11 +61,8 @@ evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: #deepspeed/zero2.json # multi-gpu only
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -1,5 +1,4 @@
base_model: TheBloke/Llama-2-7B-GPTQ
is_llama_derived_model: false
gptq: true
gptq_disable_exllama: true
model_type: AutoModelForCausalLM

examples/llama-2/lisa.yml Normal file
View File

@@ -0,0 +1,75 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./lisa-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
lisa_n_layers: 2
lisa_step_interval: 20
lisa_layers_attribute: model.layers
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 5e-5 # recommendation from lisa paper for 7b
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true
warmup_steps: 100
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -0,0 +1,69 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
peft:
loftq_config:
loftq_bits: 4
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,7 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false
@@ -41,8 +40,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -52,11 +51,12 @@ local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
@@ -64,6 +64,3 @@ weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -0,0 +1,76 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: yahma/alpaca-cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:
sequence_len: 512
sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
- full_shard
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_state_dict_type: SHARDED_STATE_DICT
special_tokens:

View File

@@ -1,7 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -65,6 +64,3 @@ weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -1,7 +1,7 @@
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -47,8 +47,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -34,8 +34,8 @@ learning_rate: 5e-5
train_on_inputs: false
group_by_length: true
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: false
@@ -49,7 +49,7 @@ flash_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:

View File

@@ -8,5 +8,5 @@ accelerate launch -m axolotl.cli.train examples/mistral/config.yml
If you run into CUDA OOM, use deepspeed with config zero2.json:
```shell
accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed/zero2.json
accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed_configs/zero2.json
```

View File

@@ -1,7 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true
load_in_8bit: false
load_in_4bit: false
@@ -34,8 +33,8 @@ learning_rate: 0.000005
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -49,7 +48,7 @@ flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
@@ -57,6 +56,3 @@ weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -0,0 +1,79 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./lora-out
eval_sample_packing: false
adapter: lora
lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16: false
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
sdp_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

examples/mistral/lora.yml Normal file
View File

@@ -0,0 +1,77 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./lora-out
adapter: lora
lora_model_dir:
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -0,0 +1,74 @@
base_model: mistralai/Mixtral-8x7B-v0.1
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./qlora-out
model_config:
output_router_logits: true
adapter: qlora
lora_model_dir:
sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
weight_decay: 0.0
fsdp:
- full_shard
fsdp_config:
fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
special_tokens:

View File

@@ -16,12 +16,12 @@ output_dir: ./qlora-out
## You can optionally freeze the entire model and unfreeze a subset of parameters
unfrozen_parameters:
# - lm_head.*
# - model.embed_tokens.*
# - model.layers.2[0-9]+.block_sparse_moe.gate.*
# - model.layers.2[0-9]+.block_sparse_moe.experts.*
# - model.layers.3[0-9]+.block_sparse_moe.gate.*
# - model.layers.3[0-9]+.block_sparse_moe.experts.*
# - ^lm_head.weight$
# - ^model.embed_tokens.weight$[:32000]
# - model.layers.2[0-9]+.block_sparse_moe.gate
# - model.layers.2[0-9]+.block_sparse_moe.experts
# - model.layers.3[0-9]+.block_sparse_moe.gate
# - model.layers.3[0-9]+.block_sparse_moe.experts
model_config:
output_router_logits: true
@@ -63,8 +63,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -81,10 +81,10 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed: deepspeed/zero2.json
deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -1,7 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -50,8 +49,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
@@ -68,7 +67,7 @@ loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
@@ -76,6 +75,3 @@ weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -33,7 +33,7 @@ lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
bf16: true
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:

View File

@@ -52,6 +52,7 @@ logging_steps: 1
xformers_attention:
flash_attention: true
gptq_groupsize:
s2_attention:
gptq_model_v1:
warmup_steps: 20
evals_per_epoch: 4

View File

@@ -3,7 +3,7 @@
Due to some nuances with the phi code, please use deepspeed when training phi for full finetune.
```shell
accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json
accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed_configs/zero1.json
# OR

View File

@@ -1,8 +1,6 @@
base_model: microsoft/phi-1_5
model_type: PhiForCausalLM
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_llama_derived_model: false
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
@@ -18,7 +16,7 @@ output_dir: ./phi-sft-out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len:
pad_to_sequence_len: true
adapter:
lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch
adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: true
bf16: true
fp16: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
bos_token: "<|endoftext|>"
eos_token: "<|endoftext|>"
unk_token: "<|endoftext|>"
pad_token: "<|endoftext|>"

View File

@@ -1,8 +1,6 @@
base_model: microsoft/phi-1_5
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_llama_derived_model: false
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
@@ -16,9 +14,9 @@ dataset_prepared_path:
val_set_size: 0.05
output_dir: ./phi-sft-out
sequence_len: 1024
sample_packing: false # not CURRENTLY compatible with LoRAs
pad_to_sequence_len:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch
adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
learning_rate: 0.000003
train_on_inputs: false
group_by_length: true
bf16: true
fp16: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
fsdp_config:
resize_token_embeddings_to_32x: true
special_tokens:
bos_token: "<|endoftext|>"
eos_token: "<|endoftext|>"
unk_token: "<|endoftext|>"
pad_token: "<|endoftext|>"

View File

@@ -1,8 +1,6 @@
base_model: microsoft/phi-2
model_revision: 834565c # pin model repo to the previous architecture
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
@@ -17,19 +15,16 @@ val_set_size: 0.05
output_dir: ./phi-sft-out
sequence_len: 2048
sample_packing: false # currently unsupported
pad_to_sequence_len:
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:
lora_r: 16
lora_alpha: 32
lora_dropout: 0.1
lora_target_linear: true
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
lora_modules_to_save:
- embd
- lm_head
wandb_project:
wandb_entity:
@@ -38,22 +33,24 @@ wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_8bit
optimizer: adamw_torch
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 1e-5
learning_rate: 0.000003
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: True
early_stopping_patience:
resume_from_checkpoint:
local_rank:

View File

@@ -27,7 +27,7 @@ num_epochs: 4
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:

examples/qwen/README.md Normal file
View File

@@ -0,0 +1,10 @@
# Qwen
TODO
# Qwen2 MoE
✅ multipack
✅ qwen2_moe 4-bit QLoRA
✅ qwen2_moe 16-bit LoRA
❓ qwen2_moe 8-bit LoRA

View File

@@ -2,7 +2,6 @@ base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: false
@@ -58,7 +57,7 @@ flash_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:

View File

@@ -2,7 +2,6 @@ base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: false
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: false
@@ -58,7 +57,7 @@ flash_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:

View File

@@ -0,0 +1,64 @@
base_model: Qwen/Qwen1.5-MoE-A2.7B
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./out
sequence_len: 1024 # supports up to 32k
sample_packing: false
pad_to_sequence_len: false
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -0,0 +1,64 @@
base_model: Qwen/Qwen1.5-MoE-A2.7B
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./out
sequence_len: 1024 # supports up to 32k
sample_packing: false
pad_to_sequence_len: false
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -34,7 +34,7 @@ lr_scheduler: cosine
learning_rate: 0.0000002
train_on_inputs: false
group_by_length: false
bf16: true
bf16: auto
tf32: true
early_stopping_patience:
resume_from_checkpoint:

View File

@@ -33,7 +33,7 @@ lr_scheduler:
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
bf16: auto
tf32: true
gradient_checkpointing:
early_stopping_patience:

View File

@@ -0,0 +1,69 @@
base_model: stabilityai/stablelm-2-1_6b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true
warmup_steps: 100
evals_per_epoch: 4
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:

View File

@@ -0,0 +1,66 @@
base_model: stabilityai/stablelm-2-1_6b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -0,0 +1,36 @@
# StableLM 2
This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case.
## Estimating GPU Requirements
| type | deepspeed | batch size | context length | vRAM GPU (GBs) |
|---------------|-----------|------------|----------------|----------------|
| full finetune | N/A | 1 | 4096 | ~21.5GBs |
| full finetune | zero2 | 1 | 4096 | ~20GBs |
| lora | N/A | 1 | 4096 | ~16.6GBs |
The above are estimates and might differ slightly depending on the setup, for example whether you pack your sequence lengths or not (the above assumes you do, to length 4096).
This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html
## Training
We have example scripts here for both full finetuning and lora using the popular alpaca dataset:
```shell
# preprocess the dataset
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml
```
Single GPU Training:
```shell
python -m axolotl.cli.train examples/stablelm-2/fft.yml --deepspeed deepspeed_configs/zero2.json
# OR
python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml
```
Multinode GPU Training with `accelerate`:
```shell
# make sure you've configured accelerate properly
accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json
```

View File

@@ -0,0 +1,69 @@
base_model: bigcode/starcoder2-3b
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.2
output_dir: ./qlora
adapter: qlora
lora_model_dir:
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16: false
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 20
evals_per_epoch: 4
eval_steps:
eval_table_size:
saves_per_epoch: 4
save_steps:
save_total_limit: 2
debug:
deepspeed:
weight_decay:
fsdp:
fsdp_config:
special_tokens:

View File

@@ -0,0 +1,64 @@
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16: false
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,7 +1,6 @@
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: true
load_in_4bit: false
@@ -16,6 +15,7 @@ output_dir: ./lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
@@ -41,8 +41,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -2,7 +2,6 @@ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: false
@@ -12,6 +11,7 @@ max_steps: 200
pretraining_dataset:
path: c4
name: en
type: pretrain
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./model-out
@@ -34,8 +34,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -1,7 +1,6 @@
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
@@ -43,8 +42,8 @@ learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true

View File

@@ -62,8 +62,8 @@ lr_scheduler: cosine
learning_rate: 0.00002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row

View File

@@ -1,14 +1,13 @@
base_model: 01-ai/Yi-34B-Chat
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_mistral_derived_model: false
is_llama_derived_model: true
load_in_8bit: false
load_in_4bit: true
strict: false
sequence_len: 1024
bf16: true
fp16: false
bf16: auto
fp16:
tf32: false
flash_attention: true
special_tokens:
@@ -29,7 +28,7 @@ num_epochs: 1
val_set_size: 0.1
evals_per_epoch: 5
eval_table_size:
eval_table_max_new_tokens: 128
eval_max_new_tokens: 128
eval_sample_packing: false
eval_batch_size: 1

BIN
favicon.jpg Normal file

Binary file not shown.


Some files were not shown because too many files have changed in this diff.