Compare commits


4 Commits

Author  SHA1        Message                           Date
sunny   66a1e209e3  changed yml                       2024-10-18 10:46:36 -04:00
sunny   dba033cb5b  fixed yml for issue1947 testing   2024-10-17 12:31:11 -04:00
sunny   6a32e9a0df  added yml for testing issue 1947  2024-10-17 11:26:32 -04:00
sunny   4afb2656b3  added yml for testing issue 1947  2024-10-17 11:23:47 -04:00
306 changed files with 6200 additions and 14075 deletions


@@ -1,16 +1,6 @@
 name: ci-cd-base
 on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - 'Dockerfile-base'
-      - '.github/workflows/base.yml'
-  pull_request:
-    paths:
-      - 'Dockerfile-base'
-      - '.github/workflows/base.yml'
   workflow_dispatch:
 jobs:
@@ -34,12 +24,6 @@ jobs:
           python_version: "3.11"
           pytorch: 2.3.1
           torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-        - cuda: "124"
-          cuda_version: 12.4.1
-          cudnn_version: ""
-          python_version: "3.10"
-          pytorch: 2.4.1
-          torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
         - cuda: "124"
           cuda_version: 12.4.1
           cudnn_version: ""
@@ -50,25 +34,23 @@ jobs:
           cuda_version: 12.4.1
           cudnn_version: ""
           python_version: "3.11"
-          pytorch: 2.5.1
+          pytorch: 2.4.1
           torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Docker metadata
        id: metadata
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v3
        with:
-          images: |
-            winglian/axolotl-base
-            axolotlai/axolotl-base
+          images: winglian/axolotl-base
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v4
        with:


@@ -17,7 +17,7 @@ jobs:
       - name: Set up Quarto
         uses: quarto-dev/quarto-actions/setup@v2
       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v3
         with:
           python-version: '3.10'
       - name: install dependencies


@@ -1,7 +1,6 @@
 name: lint
 on:
   # check on PRs, and manual triggers
-  merge_group:
   pull_request:
     paths:
       - '**.py'
@@ -16,9 +15,9 @@ jobs:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0


@@ -4,13 +4,11 @@ on:
   push:
     branches:
       - "main"
-    tags:
-      - "v*"
   workflow_dispatch:
 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -25,17 +23,12 @@ jobs:
           python_version: "3.11"
           pytorch: 2.3.1
           axolotl_extras: mamba-ssm
+          is_latest: true
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
           pytorch: 2.4.1
           axolotl_extras:
-        - cuda: 124
-          cuda_version: 12.4.1
-          python_version: "3.11"
-          pytorch: 2.5.1
-          axolotl_extras:
-          is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -44,12 +37,7 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: |
-            winglian/axolotl
-            axolotlai/axolotl
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
+          images: winglian/axolotl
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
       - name: Login to Docker Hub
@@ -63,7 +51,7 @@ jobs:
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_ARGS=${{ matrix.axolotl_args }}
@@ -77,7 +65,7 @@ jobs:
   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -92,17 +80,12 @@ jobs:
           python_version: "3.11"
           pytorch: 2.3.1
           axolotl_extras:
+          is_latest: true
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
           pytorch: 2.4.1
           axolotl_extras:
-        - cuda: 124
-          cuda_version: 12.4.1
-          python_version: "3.11"
-          pytorch: 2.5.1
-          axolotl_extras:
-          is_latest: true
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -111,25 +94,20 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: |
-            winglian/axolotl-cloud
-            axolotlai/axolotl-cloud
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
+          images: winglian/axolotl-cloud
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
       - name: Build
         uses: docker/build-push-action@v5
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
           file: ./docker/Dockerfile-cloud
           push: ${{ github.event_name != 'pull_request' }}
@@ -140,7 +118,7 @@ jobs:
   build-axolotl-cloud-no-tmux:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -158,25 +136,20 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: |
-            winglian/axolotl-cloud-term
-            axolotlai/axolotl-cloud-term
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
+          images: winglian/axolotl-cloud-term
       - name: Login to Docker Hub
         uses: docker/login-action@v3
         with:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
       - name: Build
         uses: docker/build-push-action@v5
         with:
           context: .
           build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             CUDA=${{ matrix.cuda }}
           file: ./docker/Dockerfile-cloud-no-tmux
           push: ${{ github.event_name != 'pull_request' }}


@@ -8,14 +8,9 @@ on:
   schedule:
     - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -26,17 +21,10 @@ jobs:
           pytorch: 2.3.1
           axolotl_extras:
           num_gpus: 2
-        - cuda: 124
-          cuda_version: 12.4.1
+        - cuda: 121
+          cuda_version: 12.1.1
           python_version: "3.11"
-          pytorch: 2.4.1
-          axolotl_extras:
-          num_gpus: 2
-          nightly_build: "true"
-        - cuda: 124
-          cuda_version: 12.4.1
-          python_version: "3.11"
-          pytorch: 2.5.1
+          pytorch: 2.3.1
           axolotl_extras:
           num_gpus: 2
           nightly_build: "true"
@@ -52,7 +40,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==0.63.64 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV


@@ -7,7 +7,7 @@ on:
 jobs:
   build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     strategy:
       fail-fast: false
       matrix:
@@ -28,11 +28,6 @@ jobs:
           python_version: "3.11"
           pytorch: 2.4.1
           axolotl_extras:
-        - cuda: 124
-          cuda_version: 12.4.1
-          python_version: "3.11"
-          pytorch: 2.5.1
-          axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -41,9 +36,7 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: |
-            winglian/axolotl
-            axolotlai/axolotl
+          images: winglian/axolotl
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
       - name: Set up Docker Buildx
@@ -71,7 +64,7 @@ jobs:
   build-axolotl-cloud:
     needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     strategy:
       matrix:
@@ -92,11 +85,6 @@ jobs:
           python_version: "3.11"
           pytorch: 2.4.1
           axolotl_extras:
-        - cuda: 124
-          cuda_version: 12.4.1
-          python_version: "3.11"
-          pytorch: 2.5.1
-          axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -105,9 +93,7 @@ jobs:
         id: metadata
         uses: docker/metadata-action@v5
         with:
-          images: |
-            winglian/axolotl-cloud
-            axolotlai/axolotl-cloud
+          images: winglian/axolotl-cloud
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
       - name: Login to Docker Hub
@@ -116,7 +102,7 @@ jobs:
           username: ${{ secrets.DOCKERHUB_USERNAME }}
           password: ${{ secrets.DOCKERHUB_TOKEN }}
       - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
       - name: Build
         uses: docker/build-push-action@v5
         with:


@@ -3,27 +3,12 @@ name: publish pypi
 on:
   push:
     tags:
-      - 'v*'
-  workflow_dispatch:
+      - '*'
 jobs:
-  setup_release:
-    name: Create Release
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-      - name: Create release
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: gh release create "$GITHUB_REF_NAME" --generate-notes
   pypi-publish:
     name: Upload release to PyPI
     runs-on: ubuntu-latest
-    needs: [setup_release]
     environment:
       name: pypi
       url: https://pypi.org/p/axolotl
@@ -31,18 +16,18 @@ jobs:
       id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
         with:
           python-version: "3.10"
       - name: Install dependencies
         run: |
           pip3 install wheel packaging
-          pip3 install --no-build-isolation -e .
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          pip3 install -e .
+          pip3 install -r requirements-tests.txt
       - name: Extract tag name
         id: tag
@@ -52,9 +37,9 @@ jobs:
         run: |
           sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
-      - name: Build a source dist
+      - name: Build a binary wheel
         run: |
-          python setup.py sdist
+          python setup.py sdist bdist_wheel
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1


@@ -9,12 +9,12 @@ jobs:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0
        env:
          SKIP: no-commit-to-branch
@@ -23,32 +23,21 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      max-parallel: 2
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
-        exclude:
-          - python_version: "3.10"
-            pytorch_version: "2.4.1"
-          - python_version: "3.10"
-            pytorch_version: "2.5.1"
+        pytorch_version: ["2.3.1", "2.4.1"]
     timeout-minutes: 20
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
       - name: Install PyTorch
         run: |
           pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
@@ -58,30 +47,17 @@ jobs:
           sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
           sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
           sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
       - name: Install dependencies
         run: |
           pip3 install --upgrade pip
           pip3 install --upgrade packaging
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
+          pip3 install -U -e .
+          pip3 install -r requirements-tests.txt
       - name: Run tests
         run: |
-          pytest -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
-          pytest tests/patched/
+          pytest --ignore=tests/e2e/ tests/
       - name: cleanup pip cache
         run: |
@@ -105,17 +81,17 @@ jobs:
           num_gpus: 1
           axolotl_extras: mamba-ssm
           nightly_build: "true"
-        - cuda: 124
-          cuda_version: 12.4.1
+        - cuda: 121
+          cuda_version: 12.1.1
           python_version: "3.11"
-          pytorch: 2.4.1
+          pytorch: 2.3.1
           num_gpus: 1
-          axolotl_extras:
+          axolotl_extras: mamba-ssm
           nightly_build: "true"
         - cuda: 124
           cuda_version: 12.4.1
           python_version: "3.11"
-          pytorch: 2.5.1
+          pytorch: 2.4.1
           num_gpus: 1
           axolotl_extras:
           nightly_build: "true"
@@ -129,7 +105,7 @@ jobs:
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==0.63.64 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV


@@ -1,7 +1,6 @@
 name: Tests
 on:
   # check on push/merge to main, PRs, and manual triggers
-  merge_group:
   push:
     branches:
       - "main"
@@ -9,35 +8,24 @@ on:
       - '**.py'
       - 'requirements.txt'
       - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
-      - 'cicd/cicd.sh'
-      - 'cicd/Dockerfile.jinja'
   pull_request:
     paths:
       - '**.py'
       - 'requirements.txt'
       - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
-      - 'cicd/cicd.sh'
-      - 'cicd/Dockerfile.jinja'
   workflow_dispatch:
-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 jobs:
   pre-commit:
     name: pre-commit
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"
          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
+      - uses: pre-commit/action@v3.0.0
        env:
          SKIP: no-commit-to-branch
@@ -46,164 +34,63 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
-      max-parallel: 2
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1", "2.5.1"]
-        exclude:
-          - python_version: "3.10"
-            pytorch_version: "2.4.1"
-          - python_version: "3.10"
-            pytorch_version: "2.5.1"
+        pytorch_version: ["2.3.1", "2.4.1"]
     timeout-minutes: 20
     steps:
       - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
       - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools wheel
       - name: Install PyTorch
         run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
+          pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
       - name: Install dependencies
         run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging
+          pip3 install -U -e .
+          pip3 install -r requirements-tests.txt
       - name: Run tests
         run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
-          pytest -v tests/patched/
+          pytest --ignore=tests/e2e/ tests/
       - name: cleanup pip cache
         run: |
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-  pytest-sdist:
-    name: PyTest from Source Dist
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      max-parallel: 1
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.4.1", "2.5.1"]
-    timeout-minutes: 20
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging setuptools setuptools_scm build wheel
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          python -m build --no-isolation --sdist
-          pip3 install --no-build-isolation dist/axolotl*.tar.gz
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-      - name: Run tests
-        run: |
-          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ tests/
-          pytest -v tests/patched/
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-  docker-e2e-tests-1st:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
+  docker-e2e-tests:
+    if: github.repository_owner == 'axolotl-ai-cloud'
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest, pytest-sdist]
+    timeout-minutes: 60
+    needs: [pre-commit, pytest]
     strategy:
       fail-fast: false
       matrix:
         include:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.10"
-            pytorch: 2.3.1
-            num_gpus: 1
-            axolotl_extras: mamba-ssm
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.11"
-            pytorch: 2.3.1
-            num_gpus: 1
-            axolotl_extras: mamba-ssm
           - cuda: 124
             cuda_version: 12.4.1
            python_version: "3.11"
@@ -220,53 +107,7 @@
       - name: Install Modal
         run: |
           python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
+          pip install modal==0.63.64 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.tests
-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [pre-commit, pytest, docker-e2e-tests-1st]
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.10"
-            pytorch: 2.3.1
-            num_gpus: 1
-            axolotl_extras: mamba-ssm
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.10"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==0.71.8 jinja2
       - name: Update env vars
         run: |
           echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV

.gitignore

@@ -1,7 +1,6 @@
 **/axolotl.egg-info
 configs
 last_run_prepared/
-outputs
 .vscode
 _site/
@@ -183,6 +182,3 @@ submit.sh
 typings/
 out/
-# vim
-*.swp


@@ -23,7 +23,7 @@ repos:
     hooks:
       - id: flake8
   - repo: https://github.com/PyCQA/pylint
-    rev: v3.3.0
+    rev: v2.17.4
     hooks:
       - id: pylint
   - repo: https://github.com/pre-commit/mirrors-mypy


@@ -1,5 +1,5 @@
 [MASTER]
-init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
+init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
 [TYPECHECK]
@@ -12,4 +12,3 @@ generated-members=numpy.*, torch.*
 disable=missing-function-docstring, line-too-long, import-error,
         too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
         too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
-        too-many-positional-arguments, possibly-used-before-assignment
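As context for the `init-hook` change above: pylint 3.x removed `find_pylintrc`, so the newer hook switches to `find_default_config_files()`, which yields `pathlib.Path` objects. Unpacked into plain Python, the one-liner does roughly this (a sketch, assuming pylint is importable):

```python
import sys

from pylint.config import find_default_config_files

# find_default_config_files() is a generator over discovered config files
# (.pylintrc, pyproject.toml, ...); take the first hit and put its directory
# on sys.path so project-local modules and plugins resolve during linting.
config_file = next(find_default_config_files())
sys.path.append(config_file.parent.as_posix())
```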


@@ -1,5 +0,0 @@
-include requirements.txt
-include README.md
-include LICENSE
-include src/setuptools_axolotl_dynamic_dependencies.py
-recursive-include axolotl *.py

README.md

@@ -1,25 +1,8 @@
-<p align="center">
-    <picture>
-        <source media="(prefers-color-scheme: dark)" srcset="image/axolotl_logo_digital_white.svg">
-        <source media="(prefers-color-scheme: light)" srcset="image/axolotl_logo_digital_black.svg">
-        <img alt="Axolotl" src="image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
-    </picture>
-</p>
-<p align="center">
-    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
-    <br/>
-    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
-    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
-    <br/>
-    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
-    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
-    <br/>
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
-    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
-</p>
+# Axolotl
+
+![tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg)
+![tests-nightly](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg)
+![multigpu-semi-weekly tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg)
 
 Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.
@@ -45,13 +28,9 @@ Features:
 ## Table of Contents
 - [Axolotl](#axolotl)
   - [Table of Contents](#table-of-contents)
-  - [Quickstart ⚡](#quickstart-)
-    - [Edge Builds](#edge-builds-)
-    - [Axolotl CLI Usage](#axolotl-cli-usage)
-  - [Badge ❤🏷️](#badge-)
-  - [Contributing 🤝](#contributing-)
-  - [Sponsors 🤝❤](#sponsors-)
   - [Axolotl supports](#axolotl-supports)
+  - [Quickstart ⚡](#quickstart-)
+    - [Usage](#usage)
   - [Advanced Setup](#advanced-setup)
     - [Environment](#environment)
       - [Docker](#docker)
@@ -83,12 +62,20 @@ Features:
     - [Tokenization Mismatch b/w Inference \& Training](#tokenization-mismatch-bw-inference--training)
   - [Debugging Axolotl](#debugging-axolotl)
   - [Need help? 🙋](#need-help-)
+  - [Badge ❤🏷️](#badge-)
+  - [Community Showcase](#community-showcase)
+  - [Contributing 🤝](#contributing-)
+  - [Sponsors 🤝❤](#sponsors-)
+    - [💎 Diamond Sponsors - Contact directly](#-diamond-sponsors---contact-directly)
+    - [🥇 Gold Sponsors - $5000/mo](#-gold-sponsors---5000mo)
+    - [🥈 Silver Sponsors - $1000/mo](#-silver-sponsors---1000mo)
+    - [🥉 Bronze Sponsors - $500/mo](#-bronze-sponsors---500mo)
 </td>
 <td>
 <div align="center">
-<img src="image/axolotl_symbol_digital_white.svg" alt="axolotl" width="160">
+<img src="image/axolotl.png" alt="axolotl" width="160">
 <div>
 <p>
 <b>Axolotl provides a unified repository for fine-tuning <br />a variety of AI models with ease</b>
@@ -105,148 +92,6 @@ Features:
 </tr>
 </table>
 
-## Quickstart ⚡
-
-Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
-
-**Requirements**: *Nvidia* GPU (Ampere architecture or newer for `bf16` and Flash Attention) or *AMD* GPU, Python >=3.10 and PyTorch >=2.3.1.
-
-```bash
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
-
-# download examples and optionally deepspeed configs to the local path
-axolotl fetch examples
-axolotl fetch deepspeed_configs  # OPTIONAL
-
-# finetune using lora
-axolotl train examples/llama-3/lora-1b.yml
-```
-
-### Edge Builds 🏎️
-
-If you're looking for the latest features and updates between releases, you'll need to install
-from source.
-
-```bash
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-pip3 install packaging ninja
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
-```
-
-### Axolotl CLI Usage
-
-We now support a new, more streamlined CLI using [click](https://click.palletsprojects.com/en/stable/).
-
-```bash
-# preprocess datasets - optional but recommended
-CUDA_VISIBLE_DEVICES="0" axolotl preprocess examples/llama-3/lora-1b.yml
-
-# finetune lora
-axolotl train examples/llama-3/lora-1b.yml
-
-# inference
-axolotl inference examples/llama-3/lora-1b.yml \
-    --lora-model-dir="./outputs/lora-out"
-
-# gradio
-axolotl inference examples/llama-3/lora-1b.yml \
-    --lora-model-dir="./outputs/lora-out" --gradio
-
-# remote yaml files - the yaml config can be hosted on a public URL
-# Note: the yaml config must directly link to the **raw** yaml
-axolotl train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml
-```
-
-We've also added a new command for fetching `examples` and `deepspeed_configs` to your
-local machine. This will come in handy when installing `axolotl` from PyPI.
-
-```bash
-# Fetch example YAML files (stores in "examples/" folder)
-axolotl fetch examples
-
-# Fetch deepspeed config files (stores in "deepspeed_configs/" folder)
-axolotl fetch deepspeed_configs
-
-# Optionally, specify a destination folder
-axolotl fetch examples --dest path/to/folder
-```
-
-### Legacy Usage
-<details>
-<summary>Click to Expand</summary>
-
-While the Axolotl CLI is the preferred method for interacting with axolotl, we
-still support the legacy `-m axolotl.cli.*` usage.
-
-```bash
-# preprocess datasets - optional but recommended
-CUDA_VISIBLE_DEVICES="0" python -m axolotl.cli.preprocess examples/llama-3/lora-1b.yml
-
-# finetune lora
-accelerate launch -m axolotl.cli.train examples/llama-3/lora-1b.yml
-
-# inference
-accelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \
-    --lora_model_dir="./outputs/lora-out"
-
-# gradio
-accelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \
-    --lora_model_dir="./outputs/lora-out" --gradio
-
-# remote yaml files - the yaml config can be hosted on a public URL
-# Note: the yaml config must directly link to the **raw** yaml
-accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml
-```
-
-</details>
-
-## Badge ❤🏷️
-
-Building something cool with Axolotl? Consider adding a badge to your model card.
-
-```markdown
-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
-```
-
-[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
-
-## Sponsors 🤝❤
-
-If you love axolotl, consider sponsoring the project by reaching out directly to [wing@axolotl.ai](ailto:wing@axolotl.ai).
-
----
-
-- [Modal](https://modal.com/) Modal lets you run data/AI jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune LLM models, run protein folding simulations, and much more.
-
----
-
-## Contributing 🤝
-
-Please read the [contributing guide](./.github/CONTRIBUTING.md)
-
-Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
-
-PRs are **greatly welcome**!
-
-Please run the quickstart instructions followed by the below to setup env:
-```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
-pre-commit install
-
-# test
-pytest tests/
-
-# optional: run against all files
-pre-commit run --all-files
-```
-
-Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
-
-<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
-</a>
-
 ## Axolotl supports
 
 |             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
@@ -272,6 +117,41 @@ Thanks to all of our contributors to date. Help drive open source AI progress fo
 ❌: not supported
 ❓: untested
+
+## Quickstart ⚡
+
+Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.
+
+**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
+
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl
+cd axolotl
+
+pip3 install packaging ninja
+pip3 install -e '.[flash-attn,deepspeed]'
+```
+
+### Usage
+```bash
+# preprocess datasets - optional but recommended
+CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/openllama-3b/lora.yml
+
+# finetune lora
+accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
+
+# inference
+accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
+    --lora_model_dir="./outputs/lora-out"
+
+# gradio
+accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
+    --lora_model_dir="./outputs/lora-out" --gradio
+
+# remote yaml files - the yaml config can be hosted on a public URL
+# Note: the yaml config must directly link to the **raw** yaml
+accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/openllama-3b/lora.yml
+```
 
 ## Advanced Setup
 
 ### Environment
@@ -279,7 +159,7 @@ Thanks to all of our contributors to date. Help drive open source AI progress fo
 #### Docker
 
 ```bash
-docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
+docker run --gpus '"all"' --rm -it winglian/axolotl:main-latest
 ```
 
 Or run on the current files for development:
@@ -298,7 +178,7 @@ Thanks to all of our contributors to date. Help drive open source AI progress fo
 A more powerful Docker command to run would be this:
 
 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-latest
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-latest
 ```
 
 It additionally:
@@ -320,7 +200,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
 3. Install Axolotl along with python dependencies
   ```bash
   pip3 install packaging
-  pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+  pip3 install -e '.[flash-attn,deepspeed]'
   ```
 4. (Optional) Login to Huggingface to use gated models/datasets.
   ```bash
@@ -330,7 +210,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
 #### Cloud GPU
 
-For cloud GPU providers that support docker images, use [`axolotlai/axolotl-cloud:main-latest`](https://hub.docker.com/r/axolotlai/axolotl-cloud/tags)
+For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)
 
 - on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
 - on JarvisLabs.ai use this [direct link](https://jarvislabs.ai/templates/axolotl)
@@ -399,7 +279,7 @@ Please use WSL or Docker!
 Use the below instead of the install method in QuickStart.
 
 ```
-pip3 install --no-build-isolation -e '.'
+pip3 install -e '.'
 ```
 
 More info: [mac.md](/docs/mac.qmd)
@@ -439,7 +319,7 @@ Write a job description in YAML as below:
 # dstack.yaml
 type: task
 
-image: axolotlai/axolotl-cloud:main-latest
+image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
 
 env:
   - HUGGING_FACE_HUB_TOKEN
@@ -503,10 +383,11 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
   # typescript
   type: ... # unimplemented custom format
 
-  # chat_template https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template
+  # fastchat conversation (deprecation soon, use chat_template)
+  # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
   - path: ...
-    type: chat_template
-    chat_template: chatml # defaults to tokenizer's chat_template
+    type: sharegpt
+    conversation: chatml # default: vicuna_v1.1
 
   # local
   - path: data.jsonl # or json
@@ -519,8 +400,8 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
     train_on_split: validation
 
   # loading from s3 or gcs
-  # s3 creds will be loaded from the system default / gcs will attempt to load from gcloud creds, google metadata service, or anon
-  - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above
+  # s3 creds will be loaded from the system default and gcs only supports public access
+  - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
     ...
 
 # Loading Data From a Public URL
@@ -681,8 +562,7 @@ plugins:
   - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
-liger_glu_activation: true
-liger_layer_norm: true
+liger_swiglu: true
 liger_fused_linear_cross_entropy: true
 ```
@@ -789,6 +669,86 @@ See [this debugging guide](docs/debugging.qmd) for tips on debugging Axolotl, al
 ## Need help? 🙋
 
-Join our [Discord server](https://discord.gg/HhrNrHJPRb) where our community members can help you.
+Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we our community members can help you.
 
-Need dedicated support? Please contact us at [wing@axolotl.ai](ailto:wing@axolotl.ai) for dedicated support options.
+Need dedicated support? Please contact us at [✉wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org) for dedicated support options.
+
+## Badge ❤🏷️
+
+Building something cool with Axolotl? Consider adding a badge to your model card.
+
+```markdown
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+```
+
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+
+## Community Showcase
+
+Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.
+
+Open Access AI Collective
+- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b-fixed)
+- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
+- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
+
+PocketDoc Labs
+- [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA)
+
+## Contributing 🤝
+
+Please read the [contributing guide](./.github/CONTRIBUTING.md)
+
+Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
+
+PRs are **greatly welcome**!
+
+Please run the quickstart instructions followed by the below to setup env:
+```bash
+pip3 install -r requirements-dev.txt -r requirements-tests.txt
+pre-commit install
+
+# test
+pytest tests/
+
+# optional: run against all files
+pre-commit run --all-files
+```
+
+Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
+
+<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
+</a>
+
+## Sponsors 🤝❤
+
+OpenAccess AI Collective is run by volunteer contributors such as [winglian](https://github.com/winglian),
+[NanoCode012](https://github.com/NanoCode012), [tmm1](https://github.com/tmm1),
+[mhenrichsen](https://github.com/mhenrichsen), [casper-hansen](https://github.com/casper-hansen),
+[hamelsmu](https://github.com/hamelsmu) and many more who help us accelerate forward by fixing bugs, answering
+community questions and implementing new features. Axolotl needs donations from sponsors for the compute needed to
+run our unit & integration tests, troubleshooting community issues, and providing bounties. If you love axolotl,
+consider sponsoring the project via [GitHub Sponsors](https://github.com/sponsors/OpenAccess-AI-Collective),
+[Ko-fi](https://ko-fi.com/axolotl_ai) or reach out directly to
+[wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org).
+
+---
+
+#### 💎 Diamond Sponsors - [Contact directly](mailto:wing@openaccessaicollective.org)
+
+---
+
+#### 🥇 Gold Sponsors - $5000/mo
+
+---
+
+#### 🥈 Silver Sponsors - $1000/mo
+
+---
+
+#### 🥉 Bronze Sponsors - $500/mo
+
+- [JarvisLabs.ai](https://jarvislabs.ai)
+
+---


@@ -1,14 +1,14 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+FROM winglian/axolotl-base:{{ BASE_TAG }}
 
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
 ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
 ENV CUDA="{{ CUDA }}"
-ENV BNB_CUDA_VERSION="{{ CUDA }}"
 ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
 ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
@@ -23,25 +23,21 @@ RUN git fetch origin +$GITHUB_REF && \
     git checkout FETCH_HEAD
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
-
 RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
         sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
         sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
         sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi
 
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
    fi
 
-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
-
 # So we can test the Docker image
-RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+RUN pip install -r requirements-tests.txt
 
 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \


@@ -1,11 +1,6 @@
 #!/bin/bash
 set -e
 
-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
-
-pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
-# pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
-pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/
-pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
-pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
-pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
+pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
+pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/


@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import App, Image
+from modal import Image, Stub
 
 cicd_path = pathlib.Path(__file__).parent.resolve()
@@ -28,7 +28,6 @@ df_args = {
     "CUDA": os.environ.get("CUDA", "121"),
     "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
     "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
 }
 
 dockerfile_contents = df_template.render(**df_args)
@@ -47,14 +46,8 @@ cicd_image = (
     .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
 )
 
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
+stub = Stub("Axolotl CI/CD", secrets=[])
 
 N_GPUS = int(os.environ.get("N_GPUS", 2))
 GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
@@ -68,18 +61,17 @@ def run_cmd(cmd: str, run_folder: str):
         exit(exit_code)  # pylint: disable=consider-using-sys-exit
 
-@app.function(
+@stub.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
-    timeout=60 * 60,
+    timeout=45 * 60,
     cpu=8.0,
     memory=131072 * N_GPUS,
-    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
 
-@app.local_entrypoint()
+@stub.local_entrypoint()
 def main():
     cicd_pytest.remote()


@@ -2,4 +2,4 @@
 set -e
 
 # only run one test at a time so as not to OOM the GPU
-pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
+pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
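The `-n`/`--dist` flags that recur in these test scripts come from the pytest-xdist plugin: `-n K` fans tests out over K worker processes, while `--dist loadfile` keeps every test from a given file on the same worker, which matters when tests in one file share GPU state. A sketch of the same invocation through pytest's Python entry point, assuming pytest-xdist is installed and the path (a placeholder here) exists:

```python
import sys

import pytest

# equivalent to: pytest -v -n2 --dist loadfile tests/e2e/multigpu/
# two workers, tests grouped by source file so stateful fixtures
# within a file never straddle processes
exit_code = pytest.main(["-v", "-n2", "--dist", "loadfile", "tests/e2e/multigpu/"])
sys.exit(exit_code)
```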


@@ -10,7 +10,7 @@ import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
-from modal import App, Image
+from modal import Image, Stub

 cicd_path = pathlib.Path(__file__).parent.resolve()
@@ -29,7 +29,6 @@ df_args = {
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
"HF_HOME": "/workspace/data/huggingface-cache/hub",
} }
dockerfile_contents = df_template.render(**df_args) dockerfile_contents = df_template.render(**df_args)
@@ -41,7 +40,6 @@ with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
 cicd_image = (
     Image.from_dockerfile(
         pathlib.Path(temp_dir) / "Dockerfile",
-        context_mount=None,
         force_build=True,
         gpu="A10G",
     )
@@ -49,14 +47,8 @@ cicd_image = (
.pip_install("fastapi==0.110.0", "pydantic==2.6.3") .pip_install("fastapi==0.110.0", "pydantic==2.6.3")
) )
app = App("Axolotl CI/CD", secrets=[]) stub = Stub("Axolotl CI/CD", secrets=[])
hf_cache_volume = modal.Volume.from_name(
"axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
}
N_GPUS = int(os.environ.get("N_GPUS", 1)) N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_CONFIG = modal.gpu.A10G(count=N_GPUS) GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
@@ -70,18 +62,17 @@ def run_cmd(cmd: str, run_folder: str):
     exit(exit_code)  # pylint: disable=consider-using-sys-exit


-@app.function(
+@stub.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
-    timeout=60 * 60,
+    timeout=45 * 60,
     cpu=8.0,
     memory=131072,
-    volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/cicd.sh", "/workspace/axolotl")


-@app.local_entrypoint()
+@stub.local_entrypoint()
 def main():
     cicd_pytest.remote()

View File

@@ -1,27 +0,0 @@
{
"zero_optimization": {
"stage": 1,
"overlap_comm": true
},
"bf16": {
"enabled": "auto"
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"compile": {
"disable": false,
"backend": "inductor"
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
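For context, deepspeed JSON files like the one removed above are not passed on the command line; they are referenced from the axolotl config itself. A minimal sketch (the path is illustrative):

```yaml
# select a deepspeed config from the axolotl training config
deepspeed: deepspeed_configs/zero1.json
```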

View File

@@ -14,6 +14,15 @@
"bf16": { "bf16": {
"enabled": true "enabled": true
}, },
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto", "gradient_accumulation_steps": "auto",
"gradient_clipping": "auto", "gradient_clipping": "auto",
"train_batch_size": "auto", "train_batch_size": "auto",

View File

@@ -24,6 +24,15 @@
"bf16": { "bf16": {
"enabled": true "enabled": true
}, },
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto", "gradient_accumulation_steps": "auto",
"gradient_clipping": "auto", "gradient_clipping": "auto",
"train_batch_size": "auto", "train_batch_size": "auto",

View File

@@ -20,6 +20,15 @@
"bf16": { "bf16": {
"enabled": true "enabled": true
}, },
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto", "gradient_accumulation_steps": "auto",
"gradient_clipping": "auto", "gradient_clipping": "auto",
"train_batch_size": "auto", "train_batch_size": "auto",

View File

@@ -1,4 +1,4 @@
-# Example config for debugging the chat_template prompt format
+# Example config for debugging the sharegpt prompt format
 base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
@@ -7,8 +7,8 @@
 load_in_8bit: true
 load_in_4bit: false
 datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
+  - path: philschmid/guanaco-sharegpt-style
+    type: sharegpt
     shards: 10
 val_set_size: 0
 output_dir: temp_debug/axolotl_outputs/model

View File

@@ -1,10 +1,11 @@
 ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.1.2"
 ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -19,15 +20,13 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
 WORKDIR /workspace/axolotl

 # If AXOLOTL_EXTRAS is set, append it in brackets
-RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
     fi
-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh

 # So we can test the Docker image
 RUN pip install pytest

View File

@@ -16,7 +16,7 @@ ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \
     && wget \
         https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
     && mkdir /root/.conda \
@@ -29,9 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA

 RUN git lfs install --skip-repo && \
     pip3 install awscli && \

View File

@@ -1,8 +1,8 @@
 ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
+FROM winglian/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
 ENV HF_HOME="/workspace/data/huggingface-cache/hub"
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"
@@ -20,8 +20,7 @@ RUN apt install --yes --no-install-recommends openssh-server tmux && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \ printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \ chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
chmod +x /root/cloud-entrypoint.sh && \ chmod +x /root/cloud-entrypoint.sh
echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
ENTRYPOINT ["/root/cloud-entrypoint.sh"] ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"] CMD ["sleep", "infinity"]

View File

@@ -1,8 +1,8 @@
 ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
+FROM winglian/axolotl:$BASE_TAG

 ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
 ENV HF_HOME="/workspace/data/huggingface-cache/hub"
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"

View File

@@ -1,10 +1,11 @@
 ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.1.2"
 ARG GITHUB_REF="main"
@@ -24,9 +25,9 @@ RUN git fetch origin +$GITHUB_REF && \
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi

 # So we can test the Docker image

View File

@@ -52,7 +52,7 @@ export GPU_ARCHS="gfx90a"
 cd flash-attention
 export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
 patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
-pip install --no-build-isolation .
+pip install .
 ```

 ### 6. Install Axolotl
@@ -63,7 +63,7 @@ Clone and install Axolotl:
 git clone https://github.com/axolotl-ai-cloud/axolotl
 cd axolotl
 pip install packaging ninja
-pip install --no-build-isolation -e .
+pip install -e .
 ```

 ### 7. Apply xformers Workaround

View File

@@ -83,7 +83,7 @@ lora_on_cpu: true
 datasets:
   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
   - path: vicgalle/alpaca-gpt4
-    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
+    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
     data_files: # Optional[str] path to source data files
@@ -91,7 +91,15 @@ datasets:
     name: # Optional[str] name of dataset configuration to load
     train_on_split: train # Optional[str] name of dataset split to load from
     revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
-    trust_remote_code: # Optional[bool] Trust remote code for untrusted source
+
+    # Optional[str] fastchat conversation type, only used with type: sharegpt
+    conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+    field_human: # Optional[str]. Human key to use for conversation.
+    field_model: # Optional[str]. Assistant key to use for conversation.
+    # Add additional keys from your dataset as input or output roles
+    roles:
+      input: # Optional[List[str]]. These will be masked based on train_on_input
+      output: # Optional[List[str]].
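Putting these sharegpt-specific keys together, a minimal sketch of a custom-role dataset entry (dataset path and role names are illustrative):

```yaml
datasets:
  - path: my-org/multi-role-chats  # illustrative path
    type: sharegpt
    conversation: chatml
    field_human: human
    field_model: gpt
    roles:
      input: ["human", "tool"]  # masked according to train_on_input
      output: ["gpt"]
```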
 # Custom user instruction prompt
 - path: repo
@@ -116,61 +124,10 @@ datasets:
   # For `completion` datasets only, uses the provided field instead of `text` column
   field:

-  # Using chat template
-  - path: ...
-    # Set type to `chat_template` to use this strategy
-    type: chat_template
-    # Specify the name of the chat template to use
-    # The name of the chat template to use for training, following values are supported:
-    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
-    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
-    # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
-    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
-    chat_template: tokenizer_default
-    # Custom jinja chat template. Used only if `chat_template: jinja` or empty.
-    chat_template_jinja:
-    # Key containing the messages (default: "messages")
-    field_messages: messages
-    # Key for role in each message (default: "role")
-    message_field_role: role
-    # Key for content in each message (default: "content")
-    message_field_content: content
-    # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
-    roles:
-      user: ["human", "user"]
-      assistant: ["gpt", "assistant"]
-      system: ["system"]
-      tool: ["tool"]
-
-    # IMPORTANT: The following fields determine which parts of the conversation to train on.
-    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
-    # See examples at `docs/dataset-formats/conversation.qmd`
-    # Note: If the below 4 fields are empty, defaults to training only on the last message.
-
-    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
-    roles_to_train: ["assistant"] # default
-    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
-    # - all: train on all EOS tokens
-    # - turn (default): train on the EOS token at the end of each trainable turn
-    # - last: train on the last EOS token in the conversation
-    train_on_eos: last
-    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
-    message_field_training: training
-    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
-    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
-    message_field_training_detail: train_detail

 # If false, the datasets will not be shuffled and will keep their original order in `datasets`.
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-# Deduplicates datasets and test_datasets with identical entries.
-dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.
 # You can use either test_datasets, or val_set_size, but not both.
 test_datasets:
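A minimal sketch of a `test_datasets` entry for a local JSONL eval split (the path is illustrative):

```yaml
test_datasets:
  - path: ./data/eval.jsonl  # illustrative local file
    ds_type: json
    split: train             # local json datasets expose a single "train" split
    type: completion
```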
@@ -184,19 +141,10 @@ test_datasets:
 # use RL training: 'dpo', 'ipo', 'kto'
 rl:

-# whether to perform weighting if doing DPO training. Boolean.
-dpo_use_weighting:
-
-# The name of the chat template to use for training, following values are supported:
-# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
-# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
-# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
-# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
-# The selected chat template will be saved to the tokenizer_config.json for easier inferencing
-# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.
-chat_template: tokenizer_default
-# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
-chat_template_jinja: null
+# Saves the desired chat template to the tokenizer_config.json for easier inferencing
+# Currently supports chatml and inst (mistral/mixtral)
+chat_template: chatml

 # Changes the default system message
 default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
@@ -244,11 +192,6 @@ total_num_tokens:
 sample_packing_group_size: 100000
 # The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
 sample_packing_bin_size: 200

-# whether to concatenate samples during pretraining
-pretraining_sample_concatenation:
-
-# Use batch flattening for speedups when not using sample_packing
-batch_flattening:

 # Passed through to transformers when loading the model when launched without accelerate
 # Use `sequential` when training w/ model parallelism to limit memory
@@ -342,8 +285,7 @@ comet_experiment_config: # Dictionary for additional configuration settings, see
 output_dir: ./completed-model

 # Whether to use torch.compile and which backend to use
-# setting to `auto` will enable torch compile when torch>=2.5.1
-torch_compile:  # Optional[Union[Literal["auto"], bool]]
+torch_compile:  # bool
 torch_compile_backend:  # Optional[str]
# Training hyperparameters # Training hyperparameters
@@ -360,11 +302,10 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
-eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps
+eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
 evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
-eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`.
-save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`.
-save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
+save_strategy: # Set to `"no"` to skip checkpoint saves
+save_steps: # Leave empty to save at each epoch
 saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
 save_total_limit: # Checkpoints saved at a time
 # Maximum number of iterations to train for. It precedes num_epochs which means that
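As a worked example of the cadence options above, a sketch that evaluates four times per epoch and keeps only the three most recent checkpoints (values illustrative):

```yaml
evals_per_epoch: 4   # mutually exclusive with eval_steps
saves_per_epoch: 1   # mutually exclusive with save_steps
save_total_limit: 3
```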
@@ -376,10 +317,6 @@ eval_table_size: # Approximate number of predictions sent to wandb depending on
 eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
 eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]

-profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
-# see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information
-# snapshots can be visualized @ https://pytorch.org/memory_viz

 loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
 loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
@@ -426,7 +363,6 @@ lr_div_factor: # Learning rate div factor
 # - adamw_torch_fused
 # - adamw_torch_xla
 # - adamw_apex_fused
-# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
 # - adafactor
 # - adamw_anyprecision
 # - sgd

View File

@@ -6,8 +6,31 @@ order: 3
 ## sharegpt

-IMPORTANT: ShareGPT is deprecated! Please see the `chat_template` section below.
+conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+Note: `type: sharegpt` opens special configs:
+
+- `conversation`: enables conversions to many Conversation types. Refer to the 'name' [here](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) for options.
+- `roles`: allows you to specify the roles for input and output. This is useful for datasets with custom roles such as `tool` etc to support masking.
+- `field_human`: specify the key to use instead of `human` in the conversation.
+- `field_model`: specify the key to use instead of `gpt` in the conversation.
+
+```yaml
+datasets:
+  path: ...
+  type: sharegpt
+
+  conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+  field_human: # Optional[str]. Human key to use for conversation.
+  field_model: # Optional[str]. Assistant key to use for conversation.
+
+  # Add additional keys from your dataset as input or output roles
+  roles:
+    input: # Optional[List[str]]. These will be masked based on train_on_input
+    output: # Optional[List[str]].
+```
 ## pygmalion
@@ -15,137 +38,34 @@ IMPORTANT: ShareGPT is deprecated!. Please see `chat_template` section below.
{"conversations": [{"role": "...", "value": "..."}]} {"conversations": [{"role": "...", "value": "..."}]}
``` ```
## sharegpt.load_role
## chat_template conversations where `role` is used instead of `from`
Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.
```{.json filename="data.jsonl"} ```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "content": "..."}]} {"conversations": [{"role": "...", "value": "..."}]}
``` ```
See `config.qmd` for full configs and supported templates. ## sharegpt.load_guanaco
### Migrating from sharegpt conversations where `from` is `prompter` `assistant` instead of default sharegpt
Most configs can be adapted as follows:
```yaml
# old
chat_template: chatml
datasets:
- path: ...
type: sharegpt
conversation: chatml
# new (if using tokenizer's chat_template)
datasets:
- path: ...
type: chat_template
field_messages: conversations
message_field_role: from
message_field_content: value
# new (if setting a new chat_template like chatml, gemma, etc)
chat_template: chatml
datasets:
- path: ...
type: chat_template
field_messages: conversations
message_field_role: from
message_field_content: value
```
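For datasets that relied on `field_human`/`field_model`, one plausible migration is to fold the custom keys into the `roles` mapping of the chat_template strategy (a sketch under that assumption; the role names are illustrative):

```yaml
# old
datasets:
  - path: ...
    type: sharegpt
    field_human: user
    field_model: bot

# new
datasets:
  - path: ...
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    roles:
      user: ["user"]
      assistant: ["bot"]
```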
-We recommend checking the below examples for other use cases.
-
-### Examples
-
-1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only the last message.
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-
-roles_to_train:
-train_on_eos:
-```
-
-2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
-
-```yaml
-chat_template: gemma # this overwrites the tokenizer's chat_template
-datasets:
-  - path: ...
-    type: chat_template
-    roles_to_train: ["assistant"] # default value
-```
-
-3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
-
-```yaml
-chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
-datasets:
-  - path: ...
-    type: chat_template
-```
-
-4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
-
-```yaml
-# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
-chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
-
-datasets:
-  - path: ...
-    type: chat_template
-```
-
-5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
-
-For a data sample that looks like:
-
-```{.json filename="data.jsonl"}
-{
-  "conversations": [
-    {"from": "system", "value": "You are an AI assistant.", "train": false},
-    {"from": "human", "value": "Hello", "train": false},
-    {"from": "assistant", "value": "Hello", "train": true},
-    {"from": "human", "value": "How are you?", "train": true},
-    {
-      "from": "assistant",
-      "value": "I'm doing very well, thank you!",
-      "train_detail": [
-        {"begin_offset": 0, "end_offset": 8, "train": false},
-        {"begin_offset": 9, "end_offset": 18, "train": true},
-        {"begin_offset": 19, "end_offset": 30, "train": false},
-      ],
-    },
-    {
-      "from": "human",
-      "value": "I'm doing very well, thank you!",
-      "train": true,
-    },
-    {"from": "assistant", "value": "Hi there!", "train": true}
-  ]
-}
-```
-
-The configuration would look like:
-
-```yaml
-datasets:
-  - path: ...
-    type: chat_template
-    chat_template: tokenizer_default
-    field_messages: conversations
-    message_field_role: from
-    message_field_content: value
-    roles_to_train: []
-    train_on_eos: turn
-    message_field_training: train
-    message_field_training_detail: train_detail
-```
-
-Tip: It is not necessary to use both `message_field_training` and `message_field_training_detail` at a time.
+## sharegpt.load_role
+
+conversations where `role` is used instead of `from`
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
+## sharegpt.load_guanaco
+
+conversations where `from` is `prompter` `assistant` instead of default sharegpt
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"from": "...", "value": "..."}]}
+```
+
+## sharegpt.load_ultrachat
+
+conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
+
+```{.json filename="data.jsonl"}
+{"messages": [{"user": "...", "assistant": "..."}]}
+```
+
+## sharegpt_jokes
+
+creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+```

View File

@@ -19,14 +19,7 @@ For pretraining, there is no prompt template or roles. The only required field
 Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:

 ```{.yaml filename="config.yaml"}
-pretraining_dataset:
-  - name:
-    path:
-    split:
-    text_column: # column in dataset with the data, usually `text`
-    type: pretrain
-    trust_remote_code:
-    skip: # number of rows of data to skip over from the beginning
+pretraining_dataset: # hf path only
 ...
 ```
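For instance, a streaming run might look like the following sketch (the dataset path is illustrative); because a streamed dataset has no precomputed length, it is common to bound the run explicitly:

```yaml
pretraining_dataset:
  - path: allenai/c4  # illustrative dataset
    name: en
    split: train
    type: pretrain
    text_column: text
max_steps: 10000  # bound the run, since streaming provides no epoch length
```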

View File

@@ -51,12 +51,12 @@ While debugging it's helpful to simplify your test scenario as much as possible.
 ### Background

-The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format. This is the format used when you have the following in your axolotl config:
+The below example shows how to configure VSCode to debug data preprocessing of the `sharegpt` format. This is the format used when you have the following in your axolotl config:

 ```yaml
 datasets:
-  - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
+  - path: <path to your sharegpt formatted dataset> # example on HF Hub: philschmid/guanaco-sharegpt-style
+    type: sharegpt
 ```

 >[!Important]

@@ -71,7 +71,7 @@ Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/us

 ```bash
 pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+pip3 install -e '.[flash-attn,deepspeed]'
 ```

 #### Remote Hosts

@@ -83,7 +83,7 @@ If you're developing on a remote host, you can easily use VSCode to debug remotely.

 The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.

-For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1]. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.
+For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_sharegpt.yml`, you would use the below configuration[^1]. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted. This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.

 ```jsonc
 // .vscode/launch.json
@@ -91,12 +91,12 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
"version": "0.2.0", "version": "0.2.0",
"configurations": [ "configurations": [
{ {
"name": "Debug axolotl prompt - chat_template", "name": "Debug axolotl prompt - sharegpt",
"type": "python", "type": "python",
"module": "accelerate.commands.launch", "module": "accelerate.commands.launch",
"request": "launch", "request": "launch",
"args": [ "args": [
"-m", "axolotl.cli.train", "dev_chat_template.yml", "-m", "axolotl.cli.train", "dev_sharegpt.yml",
// The flags below simplify debugging by overriding the axolotl config // The flags below simplify debugging by overriding the axolotl config
// with the debugging tips above. Modify as needed. // with the debugging tips above. Modify as needed.
"--dataset_processes=1", // limits data preprocessing to one process "--dataset_processes=1", // limits data preprocessing to one process
@@ -185,7 +185,7 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3
 ## Debugging With Docker

-Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.
+Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.

 ### Setup

@@ -202,17 +202,17 @@ cd axolotl

 Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

 ```bash
-docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
+docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface winglian/axolotl:main-py3.10-cu118-2.0.1
 ```

 >[!Tip]
-> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).
+> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/winglian/axolotl/tags). For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

 You will now be in the container. Next, perform an editable install of Axolotl:

 ```bash
 pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+pip3 install -e '.[flash-attn,deepspeed]'
 ```

 ### Attach To Container

@@ -240,6 +240,6 @@ style="border-radius: 10px; display: block; margin: auto;" width="560" height="3

 </div>
 <br>

-[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml`, but this is the same thing.
+[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/sharegpt.yml`, but this is the same thing.

 [^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html).

View File

@@ -1,29 +0,0 @@
---
title: Learning Rate Groups
description: "Setting different learning rates by module name"
---
## Background
Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of
modules in a model.
## Example
```yaml
lr_groups:
- name: o_proj
modules:
- self_attn.o_proj.weight
lr: 1e-6
- name: q_proj
modules:
- model.layers.2.self_attn.q_proj.weight
lr: 1e-5
learning_rate: 2e-5
```
In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate
of 1e-6 for all the self-attention `o_proj` modules across all layers, and a learning rate of 1e-5 for the 3rd layer's
self attention `q_proj` module.

View File

@@ -52,26 +52,6 @@ datasets:
     type: chat_template.argilla
 ```

-#### KTO
-
-```yaml
-rl: kto
-rl_beta: 0.5
-kto_desirable_weight: 0.2
-
-remove_unused_columns: false
-
-datasets:
-  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
-    type: llama3.ultra
-    split: train
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: true
-```

 #### Using local dataset files

 ```yaml
 datasets:
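A local-file entry typically looks like the following sketch (the path and prompt type are illustrative):

```yaml
datasets:
  - path: ./data/train.jsonl  # illustrative local file
    ds_type: json
    type: alpaca
```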

View File

@@ -11,10 +11,12 @@ standard industry baselines.
 ### Installation

-The following will install the correct unsloth and extras from source.
+The following will install unsloth from source and downgrade xformers, as unsloth is incompatible with the most up
+to date libraries.

 ```bash
-python scripts/unsloth_install.py | sh
+pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
+pip install --no-deps --force-reinstall xformers==0.0.26.post1
 ```

 ### Using unsloth w Axolotl
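Axolotl gates the individual unsloth optimizations behind config flags along these lines (a sketch; consult the current docs for the exact set):

```yaml
unsloth_lora_mlp: true
unsloth_lora_qkv: true
unsloth_lora_o: true
```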

View File

@@ -1,10 +1,6 @@
 base_model: cerebras/btlm-3b-8k-base
-# optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: GPT2Tokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true

View File

@@ -1,7 +1,4 @@
 base_model: cerebras/Cerebras-GPT-1.3B
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true
 strict: false

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-13b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-13b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-34b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-34b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-7b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false

View File

@@ -1,9 +1,6 @@
 base_model: codellama/CodeLlama-7b-hf
-# optionally might have model_type or tokenizer_type
 model_type: LlamaForCausalLM
 tokenizer_type: CodeLlamaTokenizer
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true

View File

@@ -2,15 +2,19 @@
"cells": [ "cells": [
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [ "source": [
"## Setting up" "# Example notebook for running Axolotl on google colab"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"import torch\n", "import torch\n",
@@ -18,76 +22,82 @@
"assert (torch.cuda.is_available()==True)" "assert (torch.cuda.is_available()==True)"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": {}, "metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [], "outputs": [],
"source": [ "source": [
"!pip install --no-build-isolation axolotl[deepspeed]" "!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""
] ]
}, },
         {
             "cell_type": "markdown",
-            "metadata": {},
+            "metadata": {
+                "id": "BW2MFr7HTjub"
+            },
             "source": [
-                "## Hugging Face login (optional)"
+                "## Create a yaml config file"
             ]
         },
         {
             "cell_type": "code",
             "execution_count": null,
-            "metadata": {},
-            "outputs": [],
-            "source": [
-                "from huggingface_hub import notebook_login\n",
-                "notebook_login()"
-            ]
-        },
-        {
-            "cell_type": "markdown",
-            "metadata": {},
-            "source": [
-                "## Example configuration"
-            ]
-        },
-        {
-            "cell_type": "code",
-            "execution_count": null,
-            "metadata": {},
+            "metadata": {
+                "id": "9pkF2dSoQEUN"
+            },
             "outputs": [],
             "source": [
"import yaml\n", "import yaml\n",
"\n", "\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n", "yaml_string = \"\"\"\n",
"base_model: NousResearch/Meta-Llama-3.1-8B\n", "base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"\n", "\n",
"load_in_8bit: false\n", "load_in_8bit: false\n",
"load_in_4bit: true\n", "load_in_4bit: true\n",
"strict: false\n", "strict: false\n",
"\n", "\n",
"datasets:\n", "datasets:\n",
" - path: tatsu-lab/alpaca\n", " - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n", " type: alpaca\n",
"dataset_prepared_path: last_run_prepared\n", "dataset_prepared_path:\n",
"val_set_size: 0.05\n", "val_set_size: 0.05\n",
"output_dir: ./outputs/lora-out\n", "output_dir: ./outputs/qlora-out\n",
"\n",
"sequence_len: 2048\n",
"sample_packing: true\n",
"eval_sample_packing: true\n",
"pad_to_sequence_len: true\n",
"\n", "\n",
"adapter: qlora\n", "adapter: qlora\n",
"lora_model_dir:\n", "lora_model_dir:\n",
"\n",
"sequence_len: 4096\n",
"sample_packing: true\n",
"eval_sample_packing: false\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n", "lora_r: 32\n",
"lora_alpha: 16\n", "lora_alpha: 16\n",
"lora_dropout: 0.05\n", "lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n", "lora_target_linear: true\n",
"lora_fan_in_fan_out:\n", "lora_fan_in_fan_out:\n",
"lora_modules_to_save:\n",
" - embed_tokens\n",
" - lm_head\n",
"\n", "\n",
"wandb_project:\n", "wandb_project:\n",
"wandb_entity:\n", "wandb_entity:\n",
@@ -95,12 +105,12 @@
"wandb_name:\n", "wandb_name:\n",
"wandb_log_model:\n", "wandb_log_model:\n",
"\n", "\n",
"gradient_accumulation_steps: 2\n", "gradient_accumulation_steps: 4\n",
"micro_batch_size: 1\n", "micro_batch_size: 2\n",
"num_epochs: 1\n", "num_epochs: 4\n",
"optimizer: paged_adamw_8bit\n", "optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n", "lr_scheduler: cosine\n",
"learning_rate: 2e-5\n", "learning_rate: 0.0002\n",
"\n", "\n",
"train_on_inputs: false\n", "train_on_inputs: false\n",
"group_by_length: false\n", "group_by_length: false\n",
@@ -111,15 +121,13 @@
"gradient_checkpointing: true\n", "gradient_checkpointing: true\n",
"early_stopping_patience:\n", "early_stopping_patience:\n",
"resume_from_checkpoint:\n", "resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n", "logging_steps: 1\n",
"xformers_attention:\n", "xformers_attention:\n",
"flash_attention: false\n", "flash_attention: true\n",
"sdp_attention: true\n",
"\n", "\n",
"warmup_steps: 1\n", "warmup_steps: 10\n",
"max_steps: 25\n", "evals_per_epoch: 4\n",
"evals_per_epoch: 1\n",
"eval_table_size:\n",
"saves_per_epoch: 1\n", "saves_per_epoch: 1\n",
"debug:\n", "debug:\n",
"deepspeed:\n", "deepspeed:\n",
@@ -127,9 +135,8 @@
"fsdp:\n", "fsdp:\n",
"fsdp_config:\n", "fsdp_config:\n",
"special_tokens:\n", "special_tokens:\n",
" pad_token: <|end_of_text|>\n",
"\"\"\"\n",
"\n", "\n",
"\"\"\"\n",
"\n", "\n",
"# Convert the YAML string to a Python dictionary\n", "# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n", "yaml_dict = yaml.safe_load(yaml_string)\n",
@@ -139,124 +146,31 @@
"\n", "\n",
"# Write the YAML file\n", "# Write the YAML file\n",
"with open(file_path, 'w') as file:\n", "with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)" " yaml.dump(yaml_dict, file)\n"
] ]
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"id": "bidoj8YLTusD"
},
"source": [ "source": [
"Above we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\n", "## Launch the training"
"\n",
"The Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. Let's go through them line by line:\n",
"\n",
"* \"base model\": String value, specifies the underlying pre-trained LLM that will be used for finetuning\n",
"\n",
"Next we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n",
"\n",
"* \"load_in_8bit\": Boolean value, whether to quantize the model weights into 8-bit integer.\n",
"\n",
"* \"load_in_4bit\": Boolean value, whether to quantize the model weights into 4-bit integer.\n",
"\n",
"* \"strict\": Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n",
"\n",
"* \"datasets\": a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n",
"\n",
"* \"val_set_size\": Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n",
"\n",
"* \"output_dir\": String value. Path of trained model.\n",
"\n",
"For data preprocessing:\n",
"\n",
"* \"sequence_len\": Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n",
"\n",
"* \"pad_to_sequence_len\": Boolean. Padding input to maximum sequence length.\n",
"\n",
"* \"sample_packing\": Boolean. Specifies whether to use multi-packing with block diagonal attention.\n",
"\n",
"* \"special_tokens\": Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n",
"\n",
"For LoRA configuration and its hyperparamters:\n",
"\n",
"* \"adapter\": String. Either \"lora\" or \"qlora\", depending on user's choice.\n",
"\n",
"* \"lora_model_dir\": String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n",
"\n",
"* \"lora_r\": Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n",
"\n",
"* \"lora_alpha\": Integer. Scale the weight matrices by $\\frac{\\text{lora_alpha}}{\\text{lora_r}}$Recommended to be fixed at 16.\n",
"\n",
"* \"lora_dropout\": Float that is 1 or less. The dropout probability of a lora layer.\n",
"\n",
"* \"lora_target_linear\": Boolean. If true, lora will target all linear modules in the transformers architecture.\n",
"\n",
"* \"lora_modules_to_save\": If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n",
"\n",
"See [LoRA](https://arxiv.org/abs/2106.09685) for detailed explanation of LoRA implementation.\n",
"\n",
"For the training configurations:\n",
"\n",
"* \"gradient_accumulation_steps\": Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n",
"\n",
"* \"micro_batch_size\": Integer. Batch size per gpu / gradient_accumulation_steps\n",
"\n",
"* \"num_epochs\": Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n",
"\n",
"* \"optimizer\": The optimizer to use for the training.\n",
"\n",
"* \"learning_rate\": The learning rate.\n",
"\n",
"* \"lr_scheduler\": The learning rate scheduler to use for adjusting learning rate during training.\n",
"\n",
"* \"train_on_inputs\": Boolean. Whether to ignore or include the user's prompt from the training labels.\n",
"\n",
"* \"group_by_length\": Boolean. Whether to group similarly sized data to minimize padding.\n",
"\n",
"* \"bf16\": Either \"auto\", \"true\", or \"false\". Whether to use CUDA bf16 floating point format. If set to \"auto\", will automatically apply bf16 should the gpu supports it.\n",
"\n",
"* \"fp16\": Optional. Specifies whether to use CUDA fp16. Automatically set to true if \"bf16\" is set to true. Otherwise false.\n",
"\n",
"* \"tf32\": Boolean. Whether to use CUDA tf32. Will override bf16.\n",
"\n",
"* \"gradient_checkpointing\": Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n",
"\n",
"* \"gradient_checkpointing_kwargs\": Python Dict. Fed into the trainer.\n",
"\n",
"* \"logging_steps\": Integer. Log training information over every specified number of steps.\n",
"\n",
"* \"flash_attention\": Boolean. Whether to use the [flash attention](https://github.com/Dao-AILab/flash-attention) mechanism.\n",
"\n",
"* \"sdp_attention\": Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the [original implementation](https://arxiv.org/abs/1706.03762) of transformers.)\n",
"\n",
"* \"warmup_steps\": Integer. The number of pre-training steps where a very low learning rate is used.\n",
"\n",
"* \"evals_per_epoch\": Integer. Number of evaluations to be performed within one training epoch.\n",
"\n",
"* \"saves_per_epoch\": Integer. Number of times the model is saved in one training epoch.\n",
"\n",
"* \"weight_decay\": Positive Float. Sets the \"strength\" of weight decay (i.e. setting the coefficient of L2 regularization)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see [here](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train the model"
] ]
         },
         {
             "cell_type": "code",
             "execution_count": null,
-            "metadata": {},
+            "metadata": {
+                "colab": {
+                    "base_uri": "https://localhost:8080/"
+                },
+                "id": "ydTI2Jk2RStU",
+                "outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
+            },
             "outputs": [],
             "source": [
+                "# By using the ! the command will be executed as a bash command\n",
                 "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
             ]
         },
@@ -264,7 +178,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"Predict with trained model" "## Play with inference"
] ]
}, },
{ {
@@ -273,85 +187,36 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# By using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n", "!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --lora_model_dir=\"./outputs/lora-out\" --gradio" " --qlora_model_dir=\"./qlora-out\" --gradio"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Deeper Dive"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is also helpful to gain some familiarity over some of the core inner workings of axolotl"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configuration Normalization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Axolotl uses a custom Dict class, called ```DictDefault```\n",
"to store configurations specified in the yaml configuration file (into a Python variable named ```cfg```). The definition for this custom Dict can be found in the [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py)\n",
"\n",
"```DictDefault``` is amended such that calling a missing key from it will result in a ```None``` return type. This is important because if some configuration options aren't specified by the user, the ```None``` type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out [utils/config/__init__.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/__init__.py)"
]
},
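{
"cell_type": "markdown",
"metadata": {},
"source": [
"Below is a minimal sketch of that behavior; it is an illustrative stand-in, not the actual implementation in [utils/dict.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/dict.py):\n",
"\n",
"```python\n",
"class DictDefault(dict):\n",
"    # Sketch: a dict whose missing keys yield None instead of raising KeyError.\n",
"    def __missing__(self, key):\n",
"        return None\n",
"\n",
"    def __getattr__(self, key):\n",
"        return self[key]\n",
"\n",
"cfg = DictDefault({\"learning_rate\": 2e-5})\n",
"print(cfg[\"flash_attention\"])  # None -> falsy, so defaulting logic can test it directly\n",
"print(cfg.learning_rate)       # attribute-style access also works in this sketch\n",
"```"
]
},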
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading Models, Tokenizers, and Trainer"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we inspect [cli.train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/cli/train.py), we will find that most of the heavy lifting were done by the function ```train()``` which is itself imported from [src/axolotl/train.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/train.py).\n",
"\n",
"```train()``` takes care of loading the appropriate tokenizer and pre-trained model through ```load_model()``` and ```load_tokenizer()``` from [src/axolotl/utils/models.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/models.py) respectively.\n",
"\n",
"```load_tokenizer()``` loads in the appropriate tokenizer given the desired model, as well as chat templates.\n",
"\n",
"```ModelLoader``` class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ```ModelLoader``` will utilize the corresponding \"attention hijacking\" script. For example, if the user specified the base model to be ```NousResearch/Meta-Llama-3.1-8B```, which is of llama type, and set ```flash_attn``` to ```True```, ```ModelLoader``` will load in [llama_attn_hijack_flash.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/monkeypatch/llama_attn_hijack_flash.py). For a list of supported attention hijacking, please refer to the directory [/src/axolotl/monkeypatch/](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch)\n",
"\n",
"Another important operation encompassed in ```train()``` is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of ```setup_trainer()``` from [/src/axolotl/utils/trainer.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/trainer.py), which in turn relies on modules from [/src/axolotl/core/trainer_builder.py](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/core/trainer_builder.py).\n",
"```trainer_builder.py``` provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning ('dpo', 'ipo', 'kto') )"
]
},
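{
"cell_type": "markdown",
"metadata": {},
"source": [
"A heavily simplified, hypothetical condensation of this flow; the function names follow the linked modules, but the exact signatures differ in the real source:\n",
"\n",
"```python\n",
"# Hypothetical sketch -- see src/axolotl/train.py for the actual implementation.\n",
"from axolotl.utils.models import load_model, load_tokenizer\n",
"from axolotl.utils.trainer import setup_trainer\n",
"\n",
"def train(cfg, dataset_meta):\n",
"    tokenizer = load_tokenizer(cfg)  # tokenizer + chat template for the base model\n",
"    model = load_model(cfg, tokenizer)  # ModelLoader: base model + attention patches\n",
"    trainer = setup_trainer(cfg, dataset_meta, model, tokenizer)  # task-appropriate trainer\n",
"    trainer.train()\n",
"```"
]
},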
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Monkey patch\n",
"\n",
"The [Monkey patch directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/monkeypatch) is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
] ]
} }
], ],
"metadata": { "metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": { "kernelspec": {
"display_name": "Python 3", "display_name": "Python 3 (ipykernel)",
"language": "python", "language": "python",
"name": "python3" "name": "python3"
}, },
"language_info": { "language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python", "name": "python",
"version": "3.9.6" "nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
} }
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2 "nbformat_minor": 4
} }

View File

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2 base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2 base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: true load_in_8bit: true

View File

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2 base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,6 +1,4 @@
base_model: deepseek-ai/DeepSeek-V2-Lite base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,7 +1,4 @@
base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16 base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false
@@ -12,17 +9,14 @@ strict: false
plugins: plugins:
- axolotl.integrations.liger.LigerPlugin - axolotl.integrations.liger.LigerPlugin
liger_rms_norm: true liger_rms_norm: true
liger_glu_activation: true liger_swiglu: true
liger_fused_linear_cross_entropy: true liger_fused_linear_cross_entropy: true
chat_template: deepseek_v2 chat_template: deepseek_v2
datasets: datasets:
- path: mlabonne/FineTome-100k - path: mlabonne/FineTome-100k
type: chat_template type: chat_template
split: train[:20%] split: train
field_messages: conversations
message_field_role: from
message_field_content: value
dataset_prepared_path: last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0.0 val_set_size: 0.0

View File

@@ -1,12 +1,7 @@
base_model: tiiuae/falcon-7b base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type trust_remote_code: true
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,15 +1,10 @@
# 1b: tiiuae/falcon-rw-1b # 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b # 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-7b base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false load_in_8bit: false
# enable 4bit for QLoRA # enable 4bit for QLoRA

View File

@@ -1,12 +1,7 @@
base_model: tiiuae/falcon-7b base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type trust_remote_code: true
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,10 +1,7 @@
# use google/gemma-7b if you have access # use google/gemma-7b if you have access
base_model: mhenrichsen/gemma-7b base_model: mhenrichsen/gemma-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,6 @@
base_model: google/gemma-2-9b base_model: google/gemma-2-9b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
@@ -14,11 +11,8 @@ chat_template: gemma
datasets: datasets:
- path: cgato/SlimOrcaDedupCleaned - path: cgato/SlimOrcaDedupCleaned
type: chat_template type: chat_template
chat_template: gemma
drop_system_message: true drop_system_message: true
field_messages: conversations
message_field_role: from
message_field_content: value
val_set_size: 0.0 val_set_size: 0.0
output_dir: ./outputs/out output_dir: ./outputs/out

View File

@@ -1,9 +1,6 @@
base_model: google/gemma-2-2b base_model: google/gemma-2-2b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForSequenceClassification model_type: AutoModelForSequenceClassification
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,7 +1,4 @@
base_model: EleutherAI/gpt-j-6b base_model: EleutherAI/gpt-j-6b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true
strict: false strict: false

View File

@@ -1,7 +1,4 @@
base_model: ai21labs/Jamba-v0.1 base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,6 +1,4 @@
base_model: ai21labs/Jamba-v0.1 base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,21 +1,14 @@
base_model: ai21labs/AI21-Jamba-1.5-Large base_model: ai21labs/AI21-Jamba-1.5-Large
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_4bit: true load_in_4bit: true
strict: false strict: false
use_tensorboard: true use_tensorboard: true
chat_template: jamba
datasets: datasets:
- path: cgato/SlimOrcaDedupCleaned - path: cgato/SlimOrcaDedupCleaned
type: chat_template type: chat_template
chat_template: jamba
drop_system_message: true drop_system_message: true
field_messages: conversations
message_field_role: from
message_field_content: value
dataset_prepared_path: last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0.0 val_set_size: 0.0
output_dir: jamba-large-fsdp-qlora-ft output_dir: jamba-large-fsdp-qlora-ft

View File

@@ -1,10 +1,6 @@
base_model: huggyllama/llama-7b base_model: huggyllama/llama-7b
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
datasets: datasets:
- path: openaccess-ai-collective/jeopardy - path: openaccess-ai-collective/jeopardy

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,13 +1,8 @@
base_model: TheBloke/Llama-2-7B-GPTQ base_model: TheBloke/Llama-2-7B-GPTQ
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
gptq: true gptq: true
gptq_disable_exllama: true gptq_disable_exllama: true
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
tokenizer_use_fast: true tokenizer_use_fast: true
tokenizer_legacy: true tokenizer_legacy: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,5 @@
base_model: alpindale/Llama-3.2-11B-Vision-Instruct base_model: alpindale/Llama-3.2-11B-Vision-Instruct
# optionally might have model_type or tokenizer_type or processor_type
processor_type: AutoProcessor processor_type: AutoProcessor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
strict: false strict: false
# these 3 lines are needed for now to handle vision chat templates w images # these 3 lines are needed for now to handle vision chat templates w images

View File

@@ -0,0 +1,50 @@
base_model: meta-llama/Llama-3.1-8B-Instruct
save_safetensors: true
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: ./last_run_prepared
output_dir: ./outputs/fft-out
sequence_len: 8192
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch
learning_rate: 2e-5
bf16: auto
fp16:
tf32: false
logging_steps: 2
xformers_attention:
flash_attention: true
warmup_steps: 2
evals_per_epoch: 2
save_steps: 2
max_steps: 2
weight_decay: 0.0
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: false
fsdp_use_orig_params: true
fsdp_cpu_ram_efficient_loading: false
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
fsdp_backward_prefetch: BACKWARD_PRE
special_tokens:
pad_token: "<|end_of_text|>"

View File

@@ -1,12 +1,10 @@
base_model: NousResearch/Meta-Llama-3.1-8B base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
plugins: plugins:
- axolotl.integrations.liger.LigerPlugin - axolotl.integrations.liger.LigerPlugin
liger_rope: true liger_rope: true
liger_rms_norm: true liger_rms_norm: true
liger_glu_activation: true liger_swiglu: true
liger_fused_linear_cross_entropy: true liger_fused_linear_cross_entropy: true
strict: false strict: false
@@ -16,10 +14,6 @@ datasets:
- path: mlabonne/FineTome-100k - path: mlabonne/FineTome-100k
type: chat_template type: chat_template
split: train[:20%] split: train[:20%]
field_messages: conversations
message_field_role: from
message_field_content: value
dataset_prepared_path: last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0.02 val_set_size: 0.02
output_dir: ./outputs/out output_dir: ./outputs/out

View File

@@ -1,6 +1,4 @@
base_model: NousResearch/Meta-Llama-3.1-8B base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: meta-llama/Meta-Llama-3-8B-Instruct base_model: meta-llama/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B-Instruct base_model: NousResearch/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,98 +0,0 @@
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
chat_template: llama3
rl: dpo
datasets:
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
roles:
system:
- system
user:
- user
assistant:
- assistant
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
roles:
system:
- system
user:
- user
assistant:
- assistant
dataset_exact_deduplication: true
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -1,79 +0,0 @@
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/lora-out
dataset_exact_deduplication: true
test_value: true
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>

View File

@@ -1,76 +0,0 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
adapter: lora
lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|end_of_text|>"

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,77 +0,0 @@
base_model: meta-llama/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: true
strict: false
rl: kto
rl_beta: 0.5
kto_desirable_weight: 0.2
datasets:
- path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
type: llama3.ultra
split: train
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out
remove_unused_columns: false
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: false # not supported with kto
eval_sample_packing: false
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 20
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|end_of_text|>"

View File

@@ -1,78 +0,0 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: teknium/GPT4-LLM-Cleaned
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|end_of_text|>"

View File

@@ -1,8 +1,5 @@
base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16 base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_4bit: true load_in_4bit: true
strict: false strict: false

View File

@@ -1,9 +1,6 @@
base_model: casperhansen/llama-3-70b-fp16 base_model: casperhansen/llama-3-70b-fp16
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,10 +1,7 @@
base_model: state-spaces/mamba-2.8b base_model: state-spaces/mamba-2.8b
# optionally might have model_type or tokenizer_type or tokenizer_config
model_type: MambaLMHeadModel model_type: MambaLMHeadModel
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,10 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1 base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1 base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1 base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false

View File

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1 base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false

View File

@@ -1,96 +0,0 @@
#Note that we are switching from the regular chat template to chatml.
#If you experience problems with the special tokens, training for more epochs can help.
#After training, merge the model before inference otherwise you might
#face problems with the special tokens.
base_model: mistralai/Mistral-7B-Instruct-v0.2
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: true
strict: false
chat_template: chatml
rl: dpo
datasets:
- path: olivermolenschot/alpaca_messages_dpo_test
type: chat_template.default
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-qlora
sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 16
num_epochs: 6
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: false
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
bos_token: "<|im_start|>"
eos_token: "<|im_end|>"

View File

@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1 base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1 base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1 base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1 base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1 base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,10 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1 base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true trust_remote_code: true
load_in_8bit: false load_in_8bit: false

View File

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1 base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: true load_in_4bit: true

View File

@@ -1,9 +1,5 @@
base_model: mosaicml/mpt-7b base_model: mosaicml/mpt-7b
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true # required for mpt as their model class is not merged into transformers yet trust_remote_code: true # required for mpt as their model class is not merged into transformers yet
load_in_8bit: false load_in_8bit: false
datasets: datasets:

View File

@@ -1,10 +1,6 @@
base_model: openlm-research/open_llama_3b_v2 base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false load_in_8bit: false
load_in_4bit: false load_in_4bit: false
strict: false strict: false

View File

@@ -1,10 +1,6 @@
base_model: openlm-research/open_llama_3b_v2 base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: true load_in_8bit: true
load_in_4bit: false load_in_4bit: false
strict: false strict: false

Some files were not shown because too many files have changed in this diff