Compare commits


1 Commit

Author: Wing Lian
SHA1: 3ce9b0760b
Message: fix the lora yaml for l3
Date: 2024-04-19 07:28:07 -04:00
225 changed files with 1320 additions and 10872 deletions

View File

@@ -21,12 +21,12 @@ All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT
## Getting Started
Bugs? Please check for an open issue, else create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new).
Bugs? Please check for an open issue, else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).
PRs are **greatly welcome**!
1. Fork the repository and clone it to your local machine.
2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
2. Set up the development environment by following the instructions in the [README.md](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/README.md) file.
3. Explore the codebase, run tests, and verify that everything works as expected.
Please run the commands below to set up your environment.
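The setup commands themselves fall outside this hunk; as a rough sketch (assumed, not taken from this diff), a typical dev setup for the repo looks like:

```bash
# assumed dev setup; the exact commands live in CONTRIBUTING.md beyond this hunk
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install

# run the test suite (the next hunk's context shows `pytest tests/`)
pytest tests/
```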
@@ -42,11 +42,11 @@ pytest tests/
### Reporting Bugs
If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
### Suggesting Enhancements
We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
### Submitting Pull Requests

View File

@@ -15,7 +15,7 @@ body:
label: "Please check that this issue hasn't been reported before."
description: "The **Label filters** may help make your search more focussed."
options:
- label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) didn't find any similar reports."
- label: "I searched previous [Bug Reports](https://github.com/OpenAccess-AI-Collective/axolotl/labels/bug) didn't find any similar reports."
required: true
- type: textarea

View File

@@ -1,7 +1,7 @@
blank_issues_enabled: false
contact_links:
- name: Ask a question
url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a
url: https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/q-a
about: Ask questions and discuss with other community members
- name: Discuss the Project in Discord
url: https://discord.gg/HhrNrHJPRb

View File

@@ -10,7 +10,7 @@ body:
value: |
* Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
* Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
* Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues).
* Check to make sure someone hasn't already opened a [similar issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues).
- type: textarea
attributes:
label: What piece of documentation is affected?

View File

@@ -8,9 +8,9 @@ body:
label: "⚠️ Please check that this feature request hasn't been suggested before."
description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
options:
- label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
- label: "I searched previous [Ideas in Discussions](https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
required: true
- label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) didn't find any similar feature requests."
- label: "I searched previous [Issues](https://github.com/OpenAccess-AI-Collective/axolotl/labels/enhancement) didn't find any similar feature requests."
required: true
- type: textarea

View File

@@ -5,30 +5,32 @@ on:
jobs:
build-base:
if: github.repository_owner == 'axolotl-ai-cloud'
if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners...
runs-on: axolotl-gpu-runner
strategy:
fail-fast: false
matrix:
include:
- cuda: "121"
cuda_version: 12.1.1
cudnn_version: 8
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.1
cudnn_version: 8
python_version: "3.11"
pytorch: 2.3.1
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "124"
cuda_version: 12.4.1
cudnn_version: ""
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
@@ -55,7 +57,6 @@ jobs:
labels: ${{ steps.metadata.outputs.labels }}
build-args: |
CUDA_VERSION=${{ matrix.cuda_version }}
CUDNN_VERSION=${{ matrix.cudnn_version }}
CUDA=${{ matrix.cuda }}
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }}

View File

@@ -6,7 +6,7 @@ on:
- '**.py'
- 'requirements.txt'
- '.github/workflows/*.yml'
- "*.[q]md"
- "*.md"
- "examples/**/*.y[a]?ml"
workflow_dispatch:

View File

@@ -8,26 +8,27 @@ on:
jobs:
build-axolotl:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
axolotl_extras: mamba-ssm
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras: mamba-ssm
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
is_latest: true
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -59,32 +60,31 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}
build-axolotl-cloud:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -115,45 +115,3 @@ jobs:
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}
build-axolotl-cloud-no-tmux:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
with:
images: winglian/axolotl-cloud-term
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
CUDA=${{ matrix.cuda }}
file: ./docker/Dockerfile-cloud-no-tmux
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}

View File

@@ -1,55 +0,0 @@
name: docker-multigpu-tests-biweekly
on:
pull_request:
paths:
- 'tests/e2e/multigpu/*.py'
workflow_dispatch:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
jobs:
test-axolotl-multigpu:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
num_gpus: 2
nightly_build: "true"
runs-on: [self-hosted, modal]
timeout-minutes: 120
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.multigpu

View File

@@ -7,26 +7,27 @@ on:
jobs:
build-axolotl:
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
is_latest: true
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.1
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -64,26 +65,26 @@ jobs:
build-axolotl-cloud:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.2.1
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:

View File

@@ -1,120 +0,0 @@
name: Tests Nightly against upstream main
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
jobs:
pre-commit:
name: pre-commit
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.0
env:
SKIP: no-commit-to-branch
pytest:
name: PyTest
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.0"]
timeout-minutes: 20
steps:
- name: Check out repository code
uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
- name: Update requirements.txt
run: |
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
- name: Install dependencies
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging
pip3 install -U -e .
pip3 install -r requirements-tests.txt
- name: Run tests
run: |
pytest --ignore=tests/e2e/ tests/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 60
needs: [pre-commit, pytest]
strategy:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
python_version: "3.10"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
num_gpus: 1
axolotl_extras: mamba-ssm
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.4.0
num_gpus: 1
axolotl_extras:
nightly_build: "true"
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
- name: Run tests job on Modal
run: |
modal run cicd.tests

View File

@@ -26,8 +26,6 @@ jobs:
python-version: "3.10"
cache: 'pip' # caching pip dependencies
- uses: pre-commit/action@v3.0.0
env:
SKIP: no-commit-to-branch
pytest:
name: PyTest
@@ -36,7 +34,6 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.10", "3.11"]
pytorch_version: ["2.3.1", "2.4.0"]
timeout-minutes: 20
steps:
@@ -49,10 +46,6 @@ jobs:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }} --index-url https://download.pytorch.org/whl/cpu
- name: Install dependencies
run: |
pip3 install --upgrade pip
@@ -64,12 +57,8 @@ jobs:
run: |
pytest --ignore=tests/e2e/ tests/
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud'
if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 60
@@ -79,24 +68,22 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 121
cuda_version: 12.1.1
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.3.1
pytorch: 2.1.2
axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 121
cuda_version: 12.1.1
python_version: "3.11"
pytorch: 2.3.1
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.2
num_gpus: 1
axolotl_extras: mamba-ssm
- cuda: 124
cuda_version: 12.4.1
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.4.0
pytorch: 2.2.1
num_gpus: 1
axolotl_extras:
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -107,13 +94,12 @@ jobs:
- name: Install Modal
run: |
python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2
pip install modal jinja2
- name: Update env vars
run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal

.gitignore (7 lines changed)
View File

@@ -133,7 +133,6 @@ venv/
ENV/
env.bak/
venv.bak/
venv3.10/
# Spyder project settings
.spyderproject
@@ -176,9 +175,3 @@ qlora-out/*
mlruns/*
/.quarto/
prepared-datasets/
submit.sh
*.out*
typings/
out/

View File

@@ -11,9 +11,6 @@ ignore_errors = True
[mypy-axolotl.models.mixtral.*]
ignore_errors = True
[mypy-axolotl.integrations.liger.models.*]
ignore_errors = True
[mypy-axolotl.models.phi.*]
ignore_errors = True

View File

@@ -8,8 +8,6 @@ repos:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- id: no-commit-to-branch
args: ['--branch', 'main']
- repo: https://github.com/psf/black
rev: 23.3.0
hooks:

README.md (161 lines changed)
View File

@@ -1,9 +1,5 @@
# Axolotl
![tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg)
![tests-nightly](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg)
![multigpu-semi-weekly tests](https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg)
Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.
Features:
@@ -11,7 +7,7 @@ Features:
- Supports fullfinetune, lora, qlora, relora, and gptq
- Customize configurations using a simple yaml file or CLI overwrite
- Load different dataset formats, use custom formats, or bring your own tokenized datasets
- Integrated with xformer, flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
- Integrated with xformer, flash attention, rope scaling, and multipacking
- Works with single GPU or multiple GPUs via FSDP or Deepspeed
- Easily run with Docker locally or on the cloud
- Log results and optionally checkpoints to wandb or mlflow
@@ -26,50 +22,37 @@ Features:
<td>
## Table of Contents
- [Axolotl](#axolotl)
- [Table of Contents](#table-of-contents)
- [Axolotl supports](#axolotl-supports)
- [Quickstart ⚡](#quickstart-)
- [Usage](#usage)
- [Advanced Setup](#advanced-setup)
- [Environment](#environment)
- [Docker](#docker)
- [Conda/Pip venv](#condapip-venv)
- [Cloud GPU](#cloud-gpu)
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
- [LambdaLabs](#lambdalabs)
- [GCP](#gcp)
- [Windows](#windows)
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Launching on public clouds via dstack](#launching-on-public-clouds-via-dstack)
- [Dataset](#dataset)
- [Config](#config)
- [All Config Options](#all-config-options)
- [Train](#train)
- [Preprocess dataset](#preprocess-dataset)
- [Multi-GPU](#multi-gpu)
- [DeepSpeed](#deepspeed)
- [FSDP](#fsdp)
- [FSDP + QLoRA](#fsdp--qlora)
- [Weights \& Biases Logging](#weights--biases-logging)
- [Special Tokens](#special-tokens)
- [Liger Kernel](#liger-kernel)
- [Inference Playground](#inference-playground)
- [Merge LORA to base](#merge-lora-to-base)
- [Common Errors 🧰](#common-errors-)
- [Tokenization Mismatch b/w Inference \& Training](#tokenization-mismatch-bw-inference--training)
- [Debugging Axolotl](#debugging-axolotl)
- [Need help? 🙋](#need-help-)
- [Badge ❤🏷️](#badge-)
- [Community Showcase](#community-showcase)
- [Contributing 🤝](#contributing-)
- [Sponsors 🤝❤](#sponsors-)
- [💎 Diamond Sponsors - Contact directly](#-diamond-sponsors---contact-directly)
- [🥇 Gold Sponsors - $5000/mo](#-gold-sponsors---5000mo)
- [🥈 Silver Sponsors - $1000/mo](#-silver-sponsors---1000mo)
- [🥉 Bronze Sponsors - $500/mo](#-bronze-sponsors---500mo)
- [Introduction](#axolotl)
- [Supported Features](#axolotl-supports)
- [Quickstart](#quickstart-)
- [Environment](#environment)
- [Docker](#docker)
- [Conda/Pip venv](#condapip-venv)
- [Cloud GPU](#cloud-gpu) - Latitude.sh, JarvisLabs, RunPod
- [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
- [Windows](#windows)
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Dataset](#dataset)
- [Config](#config)
- [Train](#train)
- [Inference](#inference-playground)
- [Merge LORA to Base](#merge-lora-to-base)
- [Special Tokens](#special-tokens)
- [All Config Options](#all-config-options)
- Advanced Topics
  - [Multipack](./docs/multipack.qmd)
  - [RLHF & DPO](./docs/rlhf.qmd)
  - [Dataset Pre-Processing](./docs/dataset_preprocessing.qmd)
- [Common Errors](#common-errors-)
- [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
- [Debugging Axolotl](#debugging-axolotl)
- [Need Help?](#need-help-)
- [Badge](#badge-)
- [Community Showcase](#community-showcase)
- [Contributing](#contributing-)
- [Sponsors](#sponsors-)
</td>
<td>
@@ -83,8 +66,8 @@ Features:
<p>
Go ahead and Axolotl questions!!
</p>
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
<img alt="PyTest Status" src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
<img src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/pre-commit.yml/badge.svg?branch=main" alt="pre-commit">
<img alt="PyTest Status" src="https://github.com/OpenAccess-AI-Collective/axolotl/actions/workflows/tests.yml/badge.svg?branch=main">
</div>
</div>
@@ -111,7 +94,6 @@ Features:
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| Gemma | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
| Jamba | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
✅: supported
❌: not supported
@@ -124,7 +106,7 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
**Requirements**: Python >=3.10 and Pytorch >=2.1.1.
```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
pip3 install packaging ninja
@@ -141,15 +123,15 @@ accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
# inference
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
--lora_model_dir="./outputs/lora-out"
--lora_model_dir="./lora-out"
# gradio
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
--lora_model_dir="./outputs/lora-out" --gradio
--lora_model_dir="./lora-out" --gradio
# remote yaml files - the yaml config can be hosted on a public URL
# Note: the yaml config must directly link to the **raw** yaml
accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/openllama-3b/lora.yml
accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
```
## Advanced Setup
@@ -310,47 +292,11 @@ HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
```
#### Launching on public clouds via dstack
To launch on GPU instance (both on-demand and spot instances) on public clouds (GCP, AWS, Azure, Lambda Labs, TensorDock, Vast.ai, and CUDO), you can use [dstack](https://dstack.ai/).
Write a job description in YAML as below:
```yaml
# dstack.yaml
type: task
image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
env:
- HUGGING_FACE_HUB_TOKEN
- WANDB_API_KEY
commands:
- accelerate launch -m axolotl.cli.train config.yaml
ports:
- 6006
resources:
gpu:
memory: 24GB..
count: 2
```
Then simply run the job with the `dstack run` command, appending the `--spot` option if you want a spot instance. `dstack run` will show you the instance with the cheapest price across multiple cloud services:
```bash
pip install dstack
HUGGING_FACE_HUB_TOKEN=xxx WANDB_API_KEY=xxx dstack run . -f dstack.yaml # --spot
```
For further and more fine-grained use cases, please refer to the official [dstack documentation](https://dstack.ai/docs/) and the detailed description of the [axolotl example](https://github.com/dstackai/dstack/tree/master/examples/fine-tuning/axolotl) in the official repository.
### Dataset
Axolotl supports a variety of dataset formats. It is recommended to use a JSONL. The schema of the JSONL depends upon the task and the prompt template you wish to use. Instead of a JSONL, you can also use a HuggingFace dataset with columns for each JSONL field.
See [the documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
See [these docs](https://openaccess-ai-collective.github.io/axolotl/docs/dataset-formats/) for more information on how to use different dataset formats.
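As a quick sketch (the path is a placeholder; the keys are standard axolotl config fields), pointing a config at a local alpaca-style JSONL looks like:

```yaml
datasets:
  - path: data.jsonl   # local file; each line holds one JSON record
    ds_type: json      # read the file as JSON lines
    type: alpaca       # prompt template expecting instruction/input/output fields
```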
### Config
@@ -531,25 +477,6 @@ tokens: # these are delimiters
When you include these tokens in your axolotl config, axolotl adds these tokens to the tokenizer's vocabulary.
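A sketch of that config section (token values are illustrative):

```yaml
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
tokens: # these are delimiters
  - "<|im_start|>"
  - "<|im_end|>"
```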
##### Liger Kernel
Liger Kernel: Efficient Triton Kernels for LLM Training
https://github.com/linkedin/Liger-Kernel
Liger (LinkedIn GPU Efficient Runtime) Kernel is a collection of Triton kernels designed specifically for LLM training.
It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%. The Liger Kernel
composes well and is compatible with both FSDP and Deepspeed.
```yaml
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
```
### Inference Playground
Axolotl allows you to load your model in an interactive terminal playground for quick experimentation.
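Reusing the quickstart commands from earlier in the README, launching the playground looks like:

```bash
# interactive inference against the LoRA adapter trained above
accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
    --lora_model_dir="./lora-out"

# append --gradio to get a browser-based playground instead of the terminal
```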
@@ -645,7 +572,7 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke
3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same, adjust your inference server accordingly.
4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.
Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/finetuning/05_tokenizer_gotchas.html) for a concrete example.
Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.
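A minimal sketch of step 4 (comparing token ids), assuming a Hugging Face tokenizer; the model id and prompt strings below are placeholders:

```python
from transformers import AutoTokenizer

# assumed base model; use the tokenizer you actually trained with
tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_3b")

train_prompt = "### Instruction:\nhi there!\n\n### Response:\n"  # as materialized during training
infer_prompt = "### Instruction:\nhi there!\n\n### Response:\n"  # as built by the inference server

# the token ids must match exactly; any difference means the prompts diverge
assert tokenizer(train_prompt).input_ids == tokenizer(infer_prompt).input_ids
```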
## Debugging Axolotl
@@ -662,10 +589,10 @@ Need dedicated support? Please contact us at [✉wing@openaccessaicollective.
Building something cool with Axolotl? Consider adding a badge to your model card.
```markdown
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
```
[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
## Community Showcase
@@ -683,7 +610,7 @@ PocketDoc Labs
Please read the [contributing guide](./.github/CONTRIBUTING.md)
Bugs? Please check the [open issues](https://github.com/axolotl-ai-cloud/axolotl/issues/bug) else create a new Issue.
Bugs? Please check the [open issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues/bug) else create a new Issue.
PRs are **greatly welcome**!
@@ -701,7 +628,7 @@ pre-commit run --all-files
Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors">
<a href="https://github.com/openaccess-ai-collective/axolotl/graphs/contributors">
<img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
</a>

View File

@@ -14,7 +14,7 @@ website:
- icon: twitter
href: https://twitter.com/axolotl_ai
- icon: github
href: https://github.com/axolotl-ai-cloud/axolotl/
href: https://github.com/OpenAccess-AI-Collective/axolotl/
- icon: discord
href: https://discord.gg/7m9sfhzaf3
@@ -36,8 +36,6 @@ website:
- docs/nccl.qmd
- docs/mac.qmd
- docs/multi-node.qmd
- docs/unsloth.qmd
- docs/amd_hpc.qmd
- section: "Dataset Formats"
contents: docs/dataset-formats/*
- section: "Reference"

View File

@@ -8,14 +8,13 @@ ENV BNB_CUDA_VERSION="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
WORKDIR /workspace
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
WORKDIR /workspace/axolotl
@@ -24,20 +23,14 @@ RUN git fetch origin +$GITHUB_REF && \
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
fi
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image
RUN pip install -r requirements-tests.txt
RUN pip install pytest
# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \

View File

@@ -1,6 +1,5 @@
#!/bin/bash
set -e
pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
pytest -n1 --dist loadfile -v /workspace/axolotl/tests/e2e/patched/ /workspace/axolotl/tests/e2e/integrations/
pytest --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
pytest /workspace/axolotl/tests/e2e/patched/
pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/

View File

@@ -1,77 +0,0 @@
"""
modal application to run axolotl gpu tests in Modal
"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
import jinja2
import modal
from jinja2 import select_autoescape
from modal import Image, Stub
cicd_path = pathlib.Path(__file__).parent.resolve()
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
loader=template_loader, autoescape=select_autoescape()
)
df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
"CUDA": os.environ.get("CUDA", "121"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
}
dockerfile_contents = df_template.render(**df_args)
temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
f.write(dockerfile_contents)
cicd_image = (
Image.from_dockerfile(
pathlib.Path(temp_dir) / "Dockerfile",
force_build=True,
gpu="A10G",
)
.env(df_args)
.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
)
stub = Stub("Axolotl CI/CD", secrets=[])
N_GPUS = int(os.environ.get("N_GPUS", 2))
GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
def run_cmd(cmd: str, run_folder: str):
import subprocess # nosec
# Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) # pylint: disable=consider-using-sys-exit
@stub.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=45 * 60,
cpu=8.0,
memory=131072 * N_GPUS,
)
def cicd_pytest():
run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
@stub.local_entrypoint()
def main():
cicd_pytest.remote()

View File

@@ -1,5 +0,0 @@
#!/bin/bash
set -e
# only run one test at a time so as not to OOM the GPU
pytest -n1 /workspace/axolotl/tests/e2e/multigpu/

View File

@@ -1,8 +1,6 @@
"""
modal application to run axolotl gpu tests in Modal
"""
# pylint: disable=duplicate-code
import os
import pathlib
import tempfile
@@ -23,12 +21,11 @@ df_template = template_env.get_template("Dockerfile.jinja")
df_args = {
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.3.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.3.1"),
"CUDA": os.environ.get("CUDA", "121"),
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
"CUDA": os.environ.get("CUDA", "118"),
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
}
dockerfile_contents = df_template.render(**df_args)

View File

@@ -11,20 +11,20 @@ ARG PYTORCH_VERSION="2.1.2"
ENV PYTORCH_VERSION=$PYTORCH_VERSION
RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
WORKDIR /workspace
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
WORKDIR /workspace/axolotl
# If AXOLOTL_EXTRAS is set, append it in brackets
RUN pip install causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
else \
pip install -e .[deepspeed,flash-attn,optimizers] $AXOLOTL_ARGS; \
pip install -e .[deepspeed,flash-attn,mamba-ssm,galore] $AXOLOTL_ARGS; \
fi
# So we can test the Docker image

View File

@@ -3,7 +3,7 @@ ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder
ENV PATH="/root/miniconda3/bin:${PATH}"

View File

@@ -3,6 +3,7 @@ FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

View File

@@ -1,26 +0,0 @@
ARG BASE_TAG=main
FROM winglian/axolotl:$BASE_TAG
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
EXPOSE 8888
EXPOSE 22
COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd
RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean
RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
chmod +x /root/cloud-entrypoint.sh
ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"]

View File

@@ -16,7 +16,7 @@ RUN apt-get update && \
WORKDIR /workspace
RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
WORKDIR /workspace/axolotl

View File

@@ -1,108 +0,0 @@
---
title: Training with AMD GPUs on HPC Systems
description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs
---
This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.
## Setup
### 1. Install Python
We recommend using Miniforge, a minimal conda-based Python distribution:
```bash
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3-$(uname)-$(uname -m).sh
```
### 2. Configure Python Environment
Add Python to your PATH and ensure it's available at login:
```bash
echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc
echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile
```
### 3. Load AMD GPU Software
Load the ROCm module:
```bash
module load rocm/5.7.1
```
Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.
### 4. Install PyTorch
Install PyTorch with ROCm support:
```bash
pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall
```
### 5. Install Flash Attention
Clone and install the Flash Attention repository:
```bash
git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git
export GPU_ARCHS="gfx90a"
cd flash-attention
export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
pip install .
```
### 6. Install Axolotl
Clone and install Axolotl:
```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
cd axolotl
pip install packaging ninja
pip install -e .
```
### 7. Apply xformers Workaround
xformers appears to be incompatible with ROCm. Apply the following workarounds:
- Edit `$HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py`, modifying the code to always return `False` for SwiGLU availability from xformers.
- Edit `$HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py`, replacing the "SwiGLU" function with a pass statement.
### 8. Prepare Job Submission Script
Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include the necessary environment setup and the command to run Axolotl training. If the compute nodes do not have internet access, it is recommended to include
```bash
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
```
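A minimal Slurm sketch for this step (all directives, module names, and paths are assumptions that vary by site):

```bash
#!/bin/bash
#SBATCH --job-name=axolotl-train    # illustrative directives; adjust for your scheduler
#SBATCH --nodes=1
#SBATCH --gpus-per-node=4
#SBATCH --time=12:00:00

module load rocm/5.7.1              # module name/version from step 3; may differ on your system

export TRANSFORMERS_OFFLINE=1       # for compute nodes without internet access
export HF_DATASETS_OFFLINE=1

accelerate launch -m axolotl.cli.train /path/to/your/config.yaml
```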
### 9. Download Base Model
Download a base model using the Hugging Face CLI:
```bash
huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
```
### 10. Create Axolotl Configuration
Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.
Note: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.
### 11. Preprocess Data
Run preprocessing on the login node:
```bash
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml
```
### 12. Train
You are now ready to submit your previously prepared job script. 🚂

View File

@@ -138,7 +138,7 @@ test_datasets:
data_files:
- /workspace/data/eval.jsonl
# use RL training: 'dpo', 'ipo', 'kto'
# use RL training: 'dpo', 'ipo', 'kto_pair'
rl:
# Saves the desired chat template to the tokenizer_config.json for easier inferencing
@@ -186,11 +186,6 @@ eval_sample_packing:
# The trainer will provide recommended values for these values.
sample_packing_eff_est:
total_num_tokens:
# Increasing the following values helps with packing, but usually only slightly (<1%).
# The number of samples packed at a time.
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200
# Passed through to transformers when loading the model when launched without accelerate
# Use `sequential` when training w/ model parallelism to limit memory
@@ -232,12 +227,6 @@ lora_modules_to_save:
lora_fan_in_fan_out: false
# LoRA+ hyperparameters
# For more details about the following options, see:
# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`
loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.
peft:
# Configuration options for loftq initialization for LoRA
# https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
@@ -279,7 +268,6 @@ torch_compile_backend: # Optional[str]
# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
gradient_accumulation_steps: 1
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
# Batch size per gpu = micro_batch_size * gradient_accumulation_steps
micro_batch_size: 2
eval_batch_size:
num_epochs: 4
@@ -290,7 +278,7 @@ lr_quadratic_warmup:
logging_steps:
eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
save_strategy: # Set to `"no"` to skip checkpoint saves
save_strategy: # Set to `no` to skip checkpoint saves
save_steps: # Leave empty to save at each epoch
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
save_total_limit: # Checkpoints saved at a time

View File

@@ -54,14 +54,6 @@ conversations where `from` is `prompter` `assistant` instead of default sharegpt
{"conversations": [{"from": "...", "value": "..."}]}
```
## sharegpt.load_ultrachat
conversations where the turns field is 'messages', human is 'user' and gpt is 'assistant'.
```{.json filename="data.jsonl"}
{"messages": [{"user": "...", "assistant": "..."}]}
```
## sharegpt_jokes
creates a chat where bot is asked to tell a joke, then explain why the joke is funny

View File

@@ -4,25 +4,9 @@ description: How to use a custom pre-tokenized dataset.
order: 5
---
- Pass an empty `type:` in your axolotl config.
- Do not pass a `type:` in your axolotl config.
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
- To indicate that a token should be ignored during training, set its corresponding label to `-100`.
- You must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.
- For pretraining, do not truncate/pad documents to the context window length.
- For instruction training, documents must be truncated/padded as desired.
Sample config:
```{.yaml filename="config.yml"}
datasets:
- path: /path/to/your/file.jsonl
ds_type: json
type:
```
Sample jsonl:
```jsonl
{"input_ids":[271,299,99],"attention_mask":[1,1,1],"labels":[271,-100,99]}
{"input_ids":[87,227,8383,12],"attention_mask":[1,1,1,1],"labels":[87,227,8383,12]}
- path: ...
```

View File

@@ -192,7 +192,7 @@ Using [official Axolotl Docker images](https://hub.docker.com/r/winglian/axolotl
On the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:
```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
```

View File

@@ -20,7 +20,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
> See the [example config](#example-config) file in addition to reading these instructions.
1. Set `adapter: qlora` in your axolotl config file.
2. Enable FSDP in your axolotl config, as [described here](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#fsdp).
2. Enable FSDP in your axolotl config, as [described here](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#fsdp).
3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.
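Putting the three steps together, a minimal sketch (the model and FSDP values are illustrative; see the example config below for the real thing):

```yaml
base_model: NousResearch/Llama-2-7b-hf    # assumed; any supported llama/mistral/mixtral model
adapter: qlora                            # step 1
load_in_4bit: true
fsdp:                                     # step 2: enable FSDP
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```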
## Example Config
@@ -29,7 +29,7 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps:
## References
- [PR #1378](https://github.com/axolotl-ai-cloud/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [PR #1378](https://github.com/OpenAccess-AI-Collective/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP.
- Related HuggingFace PRs Enabling FDSP + QLoRA:
- Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544 )

View File

@@ -25,7 +25,7 @@ description: "Template-free prompt construction with the `input_output` format"
### Masking Inputs
One of the most popular features of
[axolotl](https://github.com/axolotl-ai-cloud/axolotl) is
[axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) is
setting the following configuration value:
@@ -33,7 +33,7 @@ setting the following configuration value:
train_on_inputs: false
```
If you declare a [dataset formats](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#dataset)
If you declare a [dataset formats](https://github.com/OpenAccess-AI-Collective/axolotl?tab=readme-ov-file#dataset)
such as `alpaca` or `chatml`, axolotl knows what is an input
(i.e. human) vs. an output (i.e. the assistant) and masks the input
labels so that your model can focus on predicting the outputs only.
@@ -205,7 +205,7 @@ ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
hi there!. goodbye farewell</s>
```
We can check that the right tokens are ignored by comparing the labels
We can check that the right tokens are ingored by comparing the labels
to each token:
```python
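# The original snippet is cut off in this view; an illustrative
# reconstruction (the tokenizer object is assumed) pairs each token with its label:
row = ds[0]  # ds was loaded via load_from_disk(...) in the hunk context above
for token_id, label in zip(row["input_ids"], row["labels"]):
    print(tokenizer.decode([token_id]), label)  # a label of -100 marks an ignored token
```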

View File

@@ -1,28 +0,0 @@
# MultiModal / Vision Language Models (BETA)
### Supported Models
- Mllama, i.e. llama with vision models
### Usage
Currently multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama w/ LoRA,
you'll need to use the following in YAML in combination with the rest of the required hyperparams.
```yaml
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
processor_type: AutoProcessor
skip_prepare_dataset: true
chat_template: llama3_2_vision
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
remove_unused_columns: false
sample_packing: false
# only finetune the Language model, leave the vision model and vision tower frozen
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
```

View File

@@ -49,7 +49,7 @@ remove_unused_columns: false
chat_template: chatml
datasets:
- path: argilla/ultrafeedback-binarized-preferences-cleaned
type: chat_template.argilla
type: orpo.chat_template
```
#### Using local dataset files

View File

@@ -1,19 +0,0 @@
---
title: "PyTorch ao"
description: "Custom data types and layouts for training and inference"
---
### Installation
Stable Release from the PyTorch index
```bash
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```
Nightly release
```bash
pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
```

View File

@@ -1,49 +0,0 @@
---
title: "Unsloth"
description: "Hyper-optimized QLoRA finetuning for single GPUs"
---
### Overview
Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
standard industry baselines.
### Installation
The following will install unsloth from source and downgrade xformers, as unsloth is incompatible with the most up-to-date libraries.
```bash
pip install --no-deps "unsloth @ git+https://github.com/unslothai/unsloth.git"
pip install --no-deps --force-reinstall xformers==0.0.26.post1
```
### Using unsloth with Axolotl
Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.
Our unsloth integration is currently limited to the following model architectures:
- llama
These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
```yaml
unsloth_lora_mlp: true
unsloth_lora_qkv: true
unsloth_lora_o: true
```
These options are composable and can be used with multi-gpu finetuning
```yaml
unsloth_cross_entropy_loss: true
unsloth_rms_norm: true
unsloth_rope: true
```
### Limitations
- Single GPU only; i.e. no multi-gpu support
- No deepspeed or FSDP support (requires multi-gpu)
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
- No MoE support.

View File

@@ -38,7 +38,7 @@ wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/btlm-out
output_dir: btlm-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1

View File

@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
batch_size: 4
micro_batch_size: 4
num_epochs: 2

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -1,222 +1,216 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [
"# Example notebook for running Axolotl on google colab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [],
"source": [
"import torch\n",
"# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n",
"assert (torch.cuda.is_available()==True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [
"# Example notebook for running Axolotl on google colab"
]
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [],
"source": [
"!pip install -e git+https://github.com/axolotl-ai-cloud/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\"!pip install mlflow==\"2.13.0\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BW2MFr7HTjub"
},
"source": [
"## Create an yaml config file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9pkF2dSoQEUN"
},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./outputs/qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 4096\n",
"sample_packing: true\n",
"eval_sample_packing: false\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
"wandb_watch:\n",
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"gradient_accumulation_steps: 4\n",
"micro_batch_size: 2\n",
"num_epochs: 4\n",
"optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
"bf16: auto\n",
"fp16:\n",
"tf32: false\n",
"\n",
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: true\n",
"\n",
"warmup_steps: 10\n",
"evals_per_epoch: 4\n",
"saves_per_epoch: 1\n",
"debug:\n",
"deepspeed:\n",
"weight_decay: 0.0\n",
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
"\n",
"\"\"\"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
"\n",
"# Specify your file path\n",
"file_path = 'test_axolotl.yaml'\n",
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bidoj8YLTusD"
},
"source": [
"## Launch the training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [],
"source": [
"import torch\n",
"# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n",
"assert (torch.cuda.is_available()==True)"
]
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"outputs": [],
"source": [
"# By using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [],
"source": [
"!pip install torch==\"2.1.2\"\n",
"!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BW2MFr7HTjub"
},
"source": [
"## Create an yaml config file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9pkF2dSoQEUN"
},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"is_llama_derived_model: true\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 1096\n",
"sample_packing: true\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
"wandb_watch:\n",
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"mlflow_experiment_name: colab-example\n",
"\n",
"gradient_accumulation_steps: 1\n",
"micro_batch_size: 1\n",
"num_epochs: 4\n",
"max_steps: 20\n",
"optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
"bf16: false\n",
"fp16: true\n",
"tf32: false\n",
"\n",
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: false\n",
"\n",
"warmup_steps: 10\n",
"evals_per_epoch:\n",
"saves_per_epoch:\n",
"debug:\n",
"deepspeed:\n",
"weight_decay: 0.0\n",
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
"\n",
"\"\"\"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
"\n",
"# Specify your file path\n",
"file_path = 'test_axolotl.yaml'\n",
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bidoj8YLTusD"
},
"source": [
"## Launch the training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"outputs": [],
"source": [
"# Buy using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Play with inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Buy using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --qlora_model_dir=\"./qlora-out\" --gradio"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Play with inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# By using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n",
" --qlora_model_dir=\"./qlora-out\" --gradio"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
"nbformat": 4,
"nbformat_minor": 0
}
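
For readers skimming the diff: both sides of the notebook build the training config by round-tripping a YAML string through PyYAML and then shelling out to the axolotl CLI. A minimal sketch of the same flow as a plain script, assuming axolotl, accelerate, and PyYAML are installed locally — the file name test_axolotl.yaml and the CLI entry point come from the notebook itself, while the abbreviated config keys below are illustrative, not a complete config:

import subprocess

import yaml

# The notebook embeds the full training config as a YAML string; a few
# representative keys are enough to show the round-trip.
yaml_string = """
base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
load_in_4bit: true
adapter: qlora
output_dir: ./qlora-out
"""

# Parsing and re-serializing validates that the config is well-formed YAML
# before any GPU time is spent.
config = yaml.safe_load(yaml_string)
with open("test_axolotl.yaml", "w") as f:
    yaml.dump(config, f)

# Equivalent of the notebook's `!accelerate launch ...` cell.
subprocess.run(
    ["accelerate", "launch", "-m", "axolotl.cli.train", "test_axolotl.yaml"],
    check=True,
)

Writing the parsed dict back out (rather than the raw string) means a malformed config fails fast at yaml.safe_load instead of partway through a training launch.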

View File

@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 512
sample_packing: false

View File

@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 512
sample_packing: false

View File

@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 512
sample_packing: false

View File

@@ -1,67 +0,0 @@
base_model: deepseek-ai/DeepSeek-V2-Lite
trust_remote_code: true
load_in_8bit: false
load_in_4bit: false
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
special_tokens:
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD

View File

@@ -1,83 +0,0 @@
base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
chat_template: deepseek_v2
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
adapter: qlora
lora_r: 256
lora_alpha: 256
lora_target_linear: true
peft_use_rslora: true
gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
special_tokens:
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD

View File

@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
output_dir: ./falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4

View File

@@ -42,7 +42,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
# QLoRA paper Table 9
# - 16 for 7b & 13b

View File

@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
output_dir: ./falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4

View File

@@ -12,7 +12,7 @@ datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
val_set_size: 0.1
output_dir: ./outputs/out
output_dir: ./out
adapter: qlora
lora_r: 32

View File

@@ -1,68 +0,0 @@
base_model: google/gemma-2-9b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
# huggingface repo
chat_template: gemma
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
chat_template: gemma
drop_system_message: true
val_set_size: 0.0
output_dir: ./outputs/out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 2

View File

@@ -6,5 +6,5 @@
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
- ✅ qlora single-gpu, ~51GiB VRAM
- ✅ multipack
- FSDP
- FSDP
- ❓ 8-bit LoRA

View File

@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 4096
sample_packing: false

View File

@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 4096
sample_packing: false

View File

@@ -1,61 +0,0 @@
base_model: ai21labs/AI21-Jamba-1.5-Large
tokenizer_type: AutoTokenizer
load_in_4bit: true
strict: false
use_tensorboard: true
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
chat_template: jamba
drop_system_message: true
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: jamba-large-fsdp-qlora-ft
save_safetensors: true
adapter: qlora
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: [down_proj,gate_proj,in_proj,k_proj,o_proj,out_proj,q_proj,up_proj,v_proj,x_proj]
lora_target_linear: false
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: false
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD

View File

@@ -21,7 +21,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/jeopardy-bot-7b
output_dir: ./jeopardy-bot-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 4096
sample_packing: true

View File

@@ -33,7 +33,7 @@ wandb_project:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/model-out
output_dir: ./model-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/lisa-out
output_dir: ./lisa-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -12,7 +12,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/relora-out
output_dir: ./relora-out
adapter: qlora
lora_model_dir:

View File

@@ -1,63 +0,0 @@
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
processor_type: AutoProcessor
strict: false
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
chat_template: llama3_2_vision
datasets:
- path: HuggingFaceH4/llava-instruct-mix-vsft
type: chat_template
split: train[:1%]
field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
adapter: lora
lora_model_dir:
sequence_len: 8192
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
local_rank:
logging_steps: 1
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -1,76 +0,0 @@
base_model: NousResearch/Meta-Llama-3.1-8B
plugins:
- axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true
strict: false
chat_template: llama3
datasets:
- path: mlabonne/FineTome-100k
type: chat_template
split: train[:20%]
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 100
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
fsdp_backward_prefetch: BACKWARD_PRE
special_tokens:
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot_id|>

View File

@@ -1,4 +1,6 @@
base_model: NousResearch/Meta-Llama-3.1-8B
base_model: meta-llama/Meta-Llama-3-8B
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
load_in_4bit: false
@@ -9,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 8192
sample_packing: true

View File

@@ -1,81 +0,0 @@
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
chat_template: llama3
rl: dpo
datasets:
- path: fozziethebeat/alpaca_messages_2k_dpo_test
type: chat_template.default
chat_template: llama3
field_messages: conversation
field_chosen: chosen
field_rejected: rejected
message_field_role: role
message_field_content: content
roles:
system:
- system
user:
- user
assistant:
- assistant
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -1,78 +0,0 @@
base_model: NousResearch/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
chat_template: llama3
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
chat_template: llama3
field_messages: messages
message_field_role: role
message_field_content: content
roles:
user:
- user
assistant:
- assistant
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>

View File

@@ -1,4 +1,4 @@
base_model: NousResearch/Meta-Llama-3-8B
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
@@ -11,11 +11,10 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
output_dir: ./lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
@@ -25,9 +24,6 @@ lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_modules_to_save:
- embed_tokens
- lm_head
wandb_project:
wandb_entity:
@@ -68,4 +64,4 @@ weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
pad_token: <|end_of_text|>
pad_token: <|end_of_text|>

View File

@@ -1,63 +0,0 @@
base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
tokenizer_type: AutoTokenizer
load_in_4bit: true
strict: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out/qlora-llama3_1-405b
save_safetensors: true
adapter: qlora
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
lora_target_linear: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
flash_attention: true
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_limit_all_gathers: true
fsdp_sync_module_states: true
fsdp_offload_params: true
fsdp_use_orig_params: false
fsdp_cpu_ram_efficient_loading: true
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sharding_strategy: FULL_SHARD
special_tokens:
pad_token: <|finetune_right_pad_id|>

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out/qlora-llama3-70b
output_dir: ./out/qlora-llama3-70b
adapter: qlora
lora_model_dir:

View File

@@ -1,4 +1,4 @@
base_model: NousResearch/Meta-Llama-3-8B
base_model: meta-llama/Meta-Llama-3-8B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -12,7 +12,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 2048
sample_packing: false

View File

@@ -23,7 +23,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 2048
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 8192
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out
output_dir: ./lora-out
eval_sample_packing: false
adapter: lora

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
output_dir: ./lora-out
adapter: lora
lora_model_dir:

View File

@@ -12,7 +12,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
model_config:
output_router_logits: true

View File

@@ -1,82 +0,0 @@
base_model: mistralai/Mistral-7B-v0.1
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: false
load_in_4bit: true
strict: false
rl: orpo
orpo_alpha: 0.1
remove_unused_columns: false
chat_template: chatml
datasets:
- path: argilla/ultrafeedback-binarized-preferences-cleaned
type: chat_template.argilla
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/mistral-qlora-orpo-out
adapter: qlora
lora_model_dir:
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
model_config:
output_router_logits: true

View File

@@ -12,7 +12,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
model_config:
output_router_logits: true

View File

@@ -12,7 +12,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
## You can optionally freeze the entire model and unfreeze a subset of parameters
unfrozen_parameters:

View File

@@ -21,7 +21,7 @@ model_config:
datasets:
- path: yahma/alpaca-cleaned
type: alpaca
output_dir: ./outputs/out
output_dir: ./out
sequence_len: 8000
sample_packing: true

View File

@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
adapter: qlora
lora_model_dir:

View File

@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/mpt-alpaca-7b
output_dir: ./mpt-alpaca-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

View File

@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/openllama-out
output_dir: ./openllama-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4

View File

@@ -31,7 +31,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-out
output_dir: ./lora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4

View File

@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
output_dir: ./qlora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4

View File

@@ -1,76 +0,0 @@
base_model: microsoft/Phi-3.5-mini-instruct
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: true
load_in_4bit: false
strict: false
chat_template: phi_3
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
chat_template: phi_3
field_messages: messages
message_field_role: role
message_field_content: content
roles:
user:
- user
assistant:
- assistant
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 2
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bfloat16: true
bf16: true
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
s2_attention:
warmup_steps: 10
evals_per_epoch: 4
eval_table_size:
eval_max_new_tokens: 128
saves_per_epoch: 4
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:

View File

@@ -12,7 +12,7 @@ datasets:
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out
sequence_len: 2048
sample_packing: true

View File

@@ -12,7 +12,7 @@ datasets:
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out
sequence_len: 2048
sample_packing: true

View File

@@ -12,7 +12,7 @@ datasets:
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out
output_dir: ./phi-sft-out
sequence_len: 2048
sample_packing: true

Some files were not shown because too many files have changed in this diff.