pre-commit formatting fixes

Whitespace bug fix
Command had accidentally been moved out of if-else block.
2023-08-05 22:46:02 -04:00 · 2023-08-05 15:08:44 +12:00 · 2023-08-05 11:45:12 +12:00 · 2023-08-05 11:01:44 +12:00 · 2023-08-04 12:12:05 -04:00 · 2023-08-03 16:25:30 -07:00
74 changed files with 4138 additions and 949 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -12,28 +12,19 @@ jobs:
    # this job needs to be run on self-hosted GPU runners...
    runs-on: self-hosted
    strategy:
+      fail-fast: false
      matrix:
        include:
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
-            axolotl_extras:
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
-            axolotl_extras:
-          - cuda: "117"
-            cuda_version: 11.7.0
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
-          - cuda: "118"
-            cuda_version: 11.8.0
-            python_version: "3.9"
-            pytorch: 2.0.0
-            axolotl_extras: gptq
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -57,11 +48,9 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}
+            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,28 +11,24 @@ jobs:
    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
    strategy:
+      fail-fast: false
      matrix:
        include:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras: gptq
-          - cuda: cu117
-            cuda_version: 11.7.0
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -54,13 +50,11 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
  build-axolotl-runpod:
    needs: build-axolotl
    if: github.repository_owner == 'OpenAccess-AI-Collective'
@@ -68,26 +62,21 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras: gptq
-          - cuda: cu117
-            cuda_version: 11.7.0
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -109,10 +98,9 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-runpod
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,6 +7,7 @@ jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
+      fail-fast: false
      matrix:
        python_version: ["3.9", "3.10"]
    timeout-minutes: 10
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,5 +1,5 @@
 default_language_version:
-    python: python3.9
+    python: python3

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
--- a/FAQS.md
+++ b/FAQS.md
@@ -2,3 +2,6 @@

 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
+- `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
+`/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
+This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -16,31 +16,32 @@

 ## Axolotl supports

-|         | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
-|---------|:----------|:------------------|------|------------|------------------------------|-----------------|--------------------|
-| llama   | ✅         | ✅                 | ✅  | ✅          | ✅                            | ✅               | ✅                  |
-| Pythia  | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| cerebras | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| mpt     | ✅         | ❌                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| falcon  | ✅         | ❌                 | ❌  | ❌          | ❌                            | ❌               | ❓                  |
+|          | fp16/fp32 | lora | qlora | gptq | gptq w/ lora | gptq w/flash attn | flash attn | xformers attn |
+|----------|:----------|:-----|-------|------|:-------------|-------------------|------------|---------------|
+| llama    | ✅         | ✅    | ✅     | ✅    | ✅             | ✅                 | ✅          | ✅             |
+| Pythia   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ❓             |
+| cerebras | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
+| mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |
+| falcon   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
+| gpt-j    | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❓          | ✅             |
+| XGen     | ✅         | ❓    | ✅     | ❓    | ❓            | ❓                 | ❓          | ✅             |


 ## Quickstart ⚡

-**Requirements**: Python 3.9 and Pytorch 2.0.
+**Requirements**: Python >=3.9 and Pytorch >=2.0.

 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl

 pip3 install -e .
-
-accelerate config
+pip3 install -U git+https://github.com/huggingface/peft.git

 # finetune lora
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml

 # inference
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
    --inference --lora_model_dir="./lora-out"
 ```

@@ -50,10 +51,16 @@ accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \

 - Docker
  ```bash
-  docker run --gpus '"all"' --rm -it winglian/axolotl:main
+  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
+  ```
+  - `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
+  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq
+
+  Or run on the current files for development:
+
+  ```sh
+  docker compose up -d
  ```
-  - `winglian/axolotl:dev`: dev branch
-  - `winglian/axolotl-runpod:main`: for runpod

 - Conda/Pip venv
  1. Install python **3.9**
@@ -61,9 +68,65 @@ accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
  2. Install pytorch stable https://pytorch.org/get-started/locally/

  3. Install python dependencies with ONE of the following:
-      - `pip3 install -e .` (recommended, supports QLoRA, no gptq/int4 support)
-      - `pip3 install -e .[gptq]` (next best if you don't need QLoRA, but want to use gptq)
-      - `pip3 install -e .[gptq_triton]`
+      - Recommended, supports QLoRA, NO gptq/int4 support
+        ```bash
+        pip3 install -e .
+        pip3 install -U git+https://github.com/huggingface/peft.git
+        ```
+      - gptq/int4 support, NO QLoRA
+        ```bash
+        pip3 install -e .[gptq]
+        ```
+      - same as above but not recommended
+        ```bash
+        pip3 install -e .[gptq_triton]
+        ```
+
+- LambdaLabs
+  <details>
+
+  <summary>Click to Expand</summary>
+
+  1. Install python
+  ```bash
+  sudo apt update
+  sudo apt install -y python3.9
+
+  sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.9 1
+  sudo update-alternatives --config python # pick 3.9 if given option
+  python -V # should be 3.9
+
+  ```
+
+  2. Install pip
+  ```bash
+  wget https://bootstrap.pypa.io/get-pip.py
+  python get-pip.py
+  ```
+
+  3. Install torch
+  ```bash
+  pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
+  ```
+
+  4. Axolotl
+  ```bash
+  git clone https://github.com/OpenAccess-AI-Collective/axolotl
+  cd axolotl
+
+  pip3 install -e . # change depending on your needs
+  pip3 install protobuf==3.20.3
+  pip3 install -U requests
+  pip3 install -U --ignore-installed psutil
+  pip3 install -U scipy
+  pip3 install git+https://github.com/huggingface/peft.git # not for gptq
+  ```
+
+  5. Set path
+  ```bash
+  export LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LD_LIBRARY_PATH
+  ```
+  </details>

 ### Dataset

@@ -73,7 +136,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"instruction": "...", "input": "...", "output": "..."}
  ```
- `sharegpt`: conversations
+- `sharegpt:chat`: conversations
  ```json
  {"conversations": [{"from": "...", "value": "..."}]}
  ```
@@ -114,16 +177,73 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"article": "...", "summary": "..."}
  ```
-
-> Have some new format to propose? Check if it's already defined in [data.py](src/axolotl/utils/data.py) in `dev` branch!
+- `alpaca_chat`: basic instruct for alpaca chat
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `alpaca_chat.load_qa`: question and answer for alpaca chat
+  ```json
+  {"question": "...", "answer": "..."}
+  ```
+- `alpaca_chat.load_concise`: question and answer for alpaca chat, for concise answers
+  ```json
+  {"instruction": "...", "input": "...", "response": "..."}
+  ```
+- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for load_camel_ai
+  ```json
+  {"message_1": "...", "message_2": "..."}
+  ```
+- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
+  ```json
+  {"system_prompt": "...", "question": "...", "response": "..."}
+  ```
+- `context_qa`: in context question answering from an article
+  ```json
+  {"article": "...", "question": "...", "answer": "..."}
+  ```
+- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
+  ```json
+  {"article": "...", "unanswerable_question": "..."}
+  ```
+- `creative_acr.load_answer`: instruction and revision
+  ```json
+  {"instruction": "...", "revision": "..."}
+  ```
+- `creative_acr.load_critique`: critique
+  ```json
+  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
+  ```
+- `creative_acr.load_revise`: critique and revise
+  ```json
+  {"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
+  ```
+- `pygmalion`: pygmalion
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+  ```json
+  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+  ```

 </details>

+#### How to add custom prompts
+
+  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). See the other files there for examples.
+  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
+
 Optionally, download some datasets, see [data/README.md](data/README.md)

+
+
 ### Config

-See sample configs in [configs](configs) folder or [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:
+See [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:

 - model
  ```yaml
@@ -133,10 +253,24 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
  ```yaml
+  sequence_len: 2048 # max token length for prompt
+
+  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # huggingface repo with specific configuration/subset
+  datasets:
+    - path: EleutherAI/pile
+      name: enron_emails
+      type: completion # format from earlier
+
+  # local
+  datasets:
+    - path: json
+      data_files: data.jsonl # or json
      type: alpaca # format from earlier
-  sequence_len: 2048 # max token length / prompt
  ```

 - loading
@@ -146,6 +280,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
  bf16: true # require >=ampere
  fp16: true
  tf32: true # require >=ampere
+  bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
+  float16: true # use instead of fp16 when you don't want AMP
  ```
  Note: Repo does not do 4-bit quantization.

@@ -173,6 +309,8 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
+# optionally, choose a specific model revision from the huggingface hub
+model_revision:
 # Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
@@ -182,6 +320,11 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
+# use_fast option for tokenizer loading from_pretrained, default to True
+tokenizer_use_fast:
+# when new tokens are added, resize the model embeddings to a multiple of 32
+# this is reported to improve training speed on some models
+resize_token_embeddings_to_32x:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -202,18 +345,21 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # this can be either a hf dataset, or relative path
+  # hf dataset repo | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format OR format:prompt_style (chat/instruct)
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    data_files: # path to source data files
    shards: # number of shards to split data into
+    name: # name of dataset configuration to load

 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
+# push checkpoints to hub
+hub_model_id: # repo path to push finetuned model
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -272,13 +418,18 @@ num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 logging_steps:
+save_steps:
+eval_steps:
+
+# save model as safetensors (require safetensors package)
+save_safetensors:

 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
 group_by_length: false

-# does not work with current implementation of 4-bit LoRA
+# Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
 gradient_checkpointing: false

 # stop training after this many evaluation losses have increased in a row
@@ -300,7 +451,15 @@ log_sweep_max_lr:
 optimizer:
 # specify weight decay
 weight_decay:
+# adamw hyperparams
+adam_beta1:
+adam_beta2:
+adam_epsilon:
+# Gradient clipping max norm
+max_grad_norm:

+# whether to use bettertransformers
+flash_optimum:
 # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
 # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
@@ -308,6 +467,11 @@ flash_attention:  # require a100 for llama
 # whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
+# Landmark attention (only llama)
+landmark_attention:
+# xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
+# llama only
+xpos_rope:

 # resume from a specific checkpoint dir
 resume_from_checkpoint:
@@ -351,17 +515,6 @@ strict:

 </details>

-### Accelerate
-
-Configure accelerate
-
-```bash
-accelerate config
-
-# Edit manually
-# nano ~/.cache/huggingface/accelerate/default_config.yaml
-```
-
 ### Train

 Run
@@ -369,17 +522,37 @@ Run
 accelerate launch scripts/finetune.py configs/your_config.yml
 ```

+#### Multi-GPU Config
+
+- llama FSDP
+```yaml
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_offload_params: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+```
+
+- llama Deepspeed: prepend `ACCELERATE_USE_DEEPSPEED=true` to the finetune command
+
 ### Inference

 Pass the appropriate flag to the train command:

 - Pretrained LORA:
  ```bash
-  --inference --lora_model_dir ./completed-model
+  --inference --lora_model_dir="./lora-output-dir"
  ```
 - Full weights finetune:
  ```bash
-  --inference --base_model ./completed-model
+  --inference --base_model="./completed-model"
+  ```
+- Full weights finetune w/ a prompt from a text file:
+  ```bash
+  cat /tmp/prompt.txt | python scripts/finetune.py configs/your_config.yml \
+    --base_model="./completed-model" --inference --prompter=None --load_in_8bit=True
  ```

 ### Merge LORA to base
@@ -390,6 +563,12 @@ Add below flag to train command above
 --merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False
 ```

+If you run out of CUDA memory, you can try to merge in system RAM with
+
+```bash
+CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ...
+```
+
 ## Common Errors 🧰

 > Cuda out of memory
@@ -397,6 +576,7 @@ Add below flag to train command above
 Please reduce any below
  - `micro_batch_size`
  - `eval_batch_size`
+  - `gradient_accumulation_steps`
  - `sequence_len`

 > RuntimeError: expected scalar type Float but found Half
@@ -407,7 +587,11 @@ Try set `fp16: true`

 Try to turn off xformers.

-## Need help? 🙋‍♂️
+> accelerate config missing
+
+It's safe to ignore it.
+
+## Need help? 🙋♂️

 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you

@@ -421,6 +605,16 @@ Building something cool with Axolotl? Consider adding a badge to your model card

 [<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)

+## Community Showcase
+
+Open Access AI Collective
+- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
+- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
+- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
+
+PocketDoc Labs
+- [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA)
+
 ## Contributing 🤝

 Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).
--- a/configs/accelerate/default_config.yaml
+++ b/configs/accelerate/default_config.yaml
@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-distributed_type: 'NO'
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -1,40 +0,0 @@
-base_model: cerebras/Cerebras-GPT-1.3B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - c_attn
-lora_fan_in_fan_out: false
-wandb_project: pythia-1.4b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.0003
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-gradient_checkpointing:
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
--- a/configs/galactica_1_3B.yml
+++ b/configs/galactica_1_3B.yml
@@ -1,41 +0,0 @@
-base_model: facebook/galactica-1.3b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: false
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-adapter:
-lora_model_dir:
-sequence_len: 1024
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 3
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: false
-tf32: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-tokens:
-  pad_token: "[PAD]"
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/configs/llama_13B_alpaca.yml
+++ b/configs/llama_13B_alpaca.yml
@@ -1,39 +0,0 @@
-base_model: huggyllama/llama-13b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-    type: sharegpt
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.002
-adapter:
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./llama-13b-sharegpt
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-warmup_steps: 1000
-save_steps:
-eval_steps:
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience: 5
-resume_from_checkpoint:
-local_rank:
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -1,44 +0,0 @@
-base_model: huggyllama/llama-65b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-65b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-warmup_steps: 1000
-save_steps:
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -1,45 +0,0 @@
-base_model: decapoda-research/llama-7b-hf-int4
-base_model_config: decapoda-research/llama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: tatsu-lab/alpaca  # original alpaca dataset
-    type: alpaca
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-test
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-auto_resume_from_checkpoints: true
-local_rank:
-load_4bit: true
-xformers_attention: true
-flash_attention:
--- a/configs/llama_7B_alpaca.yml
+++ b/configs/llama_7B_alpaca.yml
@@ -1,41 +0,0 @@
-base_model: huggyllama/llama-7b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-7b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
--- a/configs/quickstart.yml
+++ b/configs/quickstart.yml
@@ -1,45 +0,0 @@
-base_model: decapoda-research/llama-7b-hf-int4
-base_model_config: decapoda-research/llama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: tatsu-lab/alpaca  # original alpaca dataset
-    type: alpaca
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 1024
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-test
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-auto_resume_from_checkpoints: true
-local_rank:
-gptq: true
-xformers_attention: true
-flash_attention:
--- a/configs/sample.yml
+++ b/configs/sample.yml
@@ -1,87 +0,0 @@
-# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# this can also be a relative path to a model on disk
-base_model: decapoda-research/llama-7b-hf-int4
-# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# if the base_model repo on hf hub doesn't include configuration .json files,
-# you can set that here, or leave this empty to default to base_model
-base_model_config: decapoda-research/llama-7b-hf
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# whether you are training a 4-bit quantized model
-load_4bit: true
-# this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# a list of one or more datasets to finetune the model with
-datasets:
-  # this can be either a hf dataset, or relative path
-  - path: vicgalle/alpaca-gpt4
-  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca
-# axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
-val_set_size: 0.04
-# if you want to use lora, leave blank to train all parameters in original model
-adapter: lora
-# if you already have a lora model trained that you want to load, put that here
-lora_model_dir:
-# the maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# max sequence length to concatenate training samples together up to
-# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-max_packed_sequence_len: 1024
-# lora hyperparameters
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-# wandb configuration if your're using it
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-# where to save the finsihed model to
-output_dir: ./completed-model
-# training hyperparameters
-gradient_accumulation_steps: 1
-batch_size:
-micro_batch_size: 2
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-# whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
-group_by_length: false
-# Use CUDA bf16
-bf16: true
-# Use CUDA tf32
-tf32: true
-# does not work with current implementation of 4-bit LoRA
-gradient_checkpointing: false
-# stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-# specify a scheduler to use with the optimizer. only one_cycle is supported currently
-lr_scheduler:
-# whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
-flash_attention:
-# resume from a specific checkpoint dir
-resume_from_checkpoint:
-# if resume_from_checkpoint isn't set and you simply want it to start where it left off
-# be careful with this being turned on between different models
-auto_resume_from_checkpoints: false
-# don't mess with this, it's here for accelerate and torchrun
-local_rank:
--- a/configs/stability_3b.yml
+++ b/configs/stability_3b.yml
@@ -1,56 +0,0 @@
-base_model: stabilityai/stablelm-base-alpha-3b
-base_model_config: stabilityai/stablelm-base-alpha-3b
-load_in_8bit: false
-datasets:
-  - path: vicgalle/alpaca-gpt4
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter:
-lora_model_dir:
-sequence_len: 4096
-max_packed_sequence_len: 4096
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: stable-alpaca-3b
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./stable-alpaca-3b
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-torchdistx_path:
-lr_scheduler: cosine
-learning_rate: 0.0000002
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention: true
-flash_attention:
-gptq_groupsize:
-gptq_model_v1:
-warmup_steps: 100
-eval_steps: 50
-save_steps: 200
-debug:
-deepspeed:
-weight_decay: 0.01
-fsdp:
-fsdp_config:
-#tokens:
-#  pad_token: "[PAD]"
-#  bos_token: "<s>"
-#  eos_token: "</s>"
-#  unk_token: "<unk>"
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ b/configs/vicuna_13B_4bit_reflect.yml
@@ -1,45 +0,0 @@
-base_model: anon8231489123/vicuna-13b-GPTQ-4bit-128g
-base_model_config: anon8231489123/vicuna-13b-GPTQ-4bit-128g
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: false
-load_4bit: true
-gptq_groupsize: 128
-gptq_model_v1: false
-datasets:
-# https://github.com/vaguenebula/AlpacaDataReflect/blob/main/alpaca_reflect_pruned.json
-  - path: data/alpaca_reflect_pruned.jsonl
-    type: reflection
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-reflect
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-local_rank:
-flash_attention: true
--- a/data/README.md
+++ b/data/README.md
@@ -10,10 +10,10 @@ curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarit
 ## Convert the JSON data files to JSONL.

 ```shell
-python3 ./scripts/alpaca_json_to_jsonl.py --input data/alpaca_data_gpt4.json > data/alpaca_data_gpt4.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/vicuna_cleaned.json > data/vicuna_cleaned.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/roleplay-similarity_0.6-instruct-dataset.json > data/roleplay-similarity_0.6-instruct-dataset.jsonl
-python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/gpt4-instruct-similarity-0.6-dataset.json > data/gpt4-instruct-similarity-0.6-dataset.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl
 ```
 ---

--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -0,0 +1,20 @@
+# version: '3.8'
+services:
+  axolotl:
+    build:
+      context: .
+      dockerfile: ./docker/Dockerfile
+    volumes:
+      - .:/workspace/axolotl
+      - ~/.cache/huggingface/:/root/.cache/huggingface/
+    # set environment variables
+    environment:
+      - WANDB_API_KEY=${WANDB_API_KEY}
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              # count: 1
+              capabilities: [gpu]
+    command: tail -f /dev/null
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -3,18 +3,16 @@ FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA

 RUN apt-get update && \
    apt-get install -y vim curl

 WORKDIR /workspace

-RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main"
-
-RUN mkdir axolotl
-COPY . axolotl/
+RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN cd axolotl && \
    if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
@@ -23,5 +21,10 @@ RUN cd axolotl && \
        pip install -e .; \
    fi

+# fix so that git fetch/pull from remote works
+RUN cd axolotl && \
+    git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
 # helper for huggingface-login cli
 RUN git config --global credential.helper store
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -8,7 +8,7 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a
 ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.9"
-ARG PYTORCH="2.0.0"
+ARG PYTORCH_VERSION="2.0.1"
 ARG CUDA="118"

 ENV PYTHON_VERSION=$PYTHON_VERSION
@@ -29,17 +29,18 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA


 FROM base-builder AS flash-attn-builder

 WORKDIR /workspace

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

-RUN git clone https://github.com/HazyResearch/flash-attention.git && \
+RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && \
+    git checkout v2.0.1  && \
    python3 setup.py bdist_wheel && \
    cd csrc/fused_dense_lib && \
    python3 setup.py bdist_wheel && \
@@ -52,7 +53,7 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \

 FROM base-builder AS deepspeed-builder

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 WORKDIR /workspace

@@ -73,11 +74,14 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \

 FROM base-builder

+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
 # recompile apex
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
-RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
+RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./

 RUN mkdir -p /workspace/builds
 COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
@@ -97,4 +101,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docker/Dockerfile-runpod
+++ b/docker/Dockerfile-runpod
@@ -1,6 +1,10 @@
 ARG BASE_TAG=main
 FROM winglian/axolotl:$BASE_TAG

+ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+
 COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

 RUN apt install --yes --no-install-recommends openssh-server tmux && \
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -0,0 +1,60 @@
+base_model: cerebras/Cerebras-GPT-1.3B
+base_model_config: cerebras/Cerebras-GPT-1.3B
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len: 2048
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - c_fc
+  - c_attn
+  - c_proj
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+batch_size: 4
+micro_batch_size: 4
+num_epochs: 2
+optimizer: paged_adamw_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -0,0 +1,92 @@
+# 1b: tiiuae/falcon-rw-1b
+# 40b: tiiuae/falcon-40b
+base_model: tiiuae/falcon-7b
+base_model_config: tiiuae/falcon-7b
+# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
+trust_remote_code: true
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+load_in_8bit: false
+# enable 4bit for QLoRA
+load_in_4bit: true
+gptq: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: QingyiSi/Alpaca-CoT
+    data_files:
+      - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
+    type: "alpaca:chat"
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+# enable QLoRA
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len:
+
+# hyperparameters from QLoRA paper Appendix B.2
+# "We find hyperparameters to be largely robust across datasets"
+lora_r: 64
+lora_alpha: 16
+# 0.1 for models up to 13B
+# 0.05 for 33B and 65B models
+lora_dropout: 0.05
+# add LoRA modules on all linear layers of the base model
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+
+# QLoRA paper Table 9
+# - 16 for 7b & 13b
+# - 32 for 33b, 64 for 65b
+# Max size tested on A6000
+# - 7b: 40
+# - 40b: 4
+# decrease if OOM, increase for max VRAM utilization
+micro_batch_size: 1
+gradient_accumulation_steps: 2
+num_epochs: 3
+# Optimizer for QLoRA
+optimizer: paged_adamw_32bit
+torchdistx_path:
+lr_scheduler: cosine
+# QLoRA paper Table 9
+# - 2e-4 for 7b & 13b
+# - 1e-4 for 33b & 65b
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 5
+save_steps: 10
+debug:
+deepspeed:
+weight_decay: 0.000001
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"
+  bos_token: ">>ABSTRACT<<"
+  eos_token: "<|endoftext|>"
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -0,0 +1,57 @@
+base_model: EleutherAI/gpt-j-6b
+base_model_config: EleutherAI/gpt-j-6b
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len:
+lora_r: 8
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+gradient_accumulation_steps: 2
+micro_batch_size: 2
+num_epochs: 2
+optimizer: paged_adamw_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0001
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"
--- a/examples/gptq-lora-7b/README.md
+++ b/examples/gptq-lora-7b/README.md
@@ -3,6 +3,6 @@
 This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.

 ```shell
-accelerate launch scripts/finetune.py examples/4bit-lora-7b/config.yml
+accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml

 ```
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -7,30 +7,28 @@ datasets:
  - path: openaccess-ai-collective/jeopardy
    type: jeopardy
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
+val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
+sequence_len: 512
+max_packed_sequence_len:
+lora_r:
+lora_alpha:
+lora_dropout:
 lora_target_modules:
-  - q_proj
-  - v_proj
 lora_fan_in_fan_out: false
-wandb_project: jeopardy-bot-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 2
+num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.0000002
+learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
 save_steps: 660
 debug:
 deepspeed:
-weight_decay: 0.0001
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 tokens:
-  pad_token: "[PAD]"
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/llama-2/README.md
+++ b/examples/llama-2/README.md
@@ -0,0 +1,20 @@
+# Overview
+
+This is an example of a llama-2 configuration for 7b and 13b. The yaml file contains configuration for the 7b variant, but you can just as well use the same settings for 13b.
+
+The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
+
+The 13b variant will fit if you change these settings to these values:
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+
+```shell
+accelerate launch scripts/finetune.py examples/llama-2/qlora.yml
+
+```
+or
+
+```shell
+accelerate launch scripts/finetune.py examples/llama-2/lora.yml
+
+```
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -0,0 +1,66 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 4096
+max_packed_sequence_len: 4096
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+  pad_token: "<pad>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -0,0 +1,67 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+max_packed_sequence_len: 4096
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+  pad_token: "<pad>"
--- a/examples/openllama-3b/README.md
+++ b/examples/openllama-3b/README.md
@@ -0,0 +1,16 @@
+# openllama-3b
+
+Basic full tune
+```shell
+accelerate launch scripts/finetune.py examples/openllama-3b/config.yml
+```
+
+LoRA
+```shell
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
+```
+
+QLoRA
+```shell
+accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml
+```
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -0,0 +1,62 @@
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.02
+adapter:
+lora_model_dir:
+sequence_len: 256
+max_packed_sequence_len:
+lora_r:
+lora_alpha:
+lora_dropout:
+lora_target_modules:
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./openllama-out
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.00001
+train_on_inputs: false
+group_by_length: false
+float16: true
+bf16: false
+fp16: false
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/lora-openllama-3b/config.yml
+++ b/examples/lora-openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
@@ -49,7 +49,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention:
+xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
--- a/examples/qlora-openllama-3b/config.yml
+++ b/examples/qlora-openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
--- a/examples/pythia-12b/README.md
+++ b/examples/pythia-12b/README.md
@@ -0,0 +1,9 @@
+# Pythia 12B
+
+- Single-GPU A100 only (?)
+
+```shell
+python scripts/finetune.py examples/pythia-12b/config.yml
+```
+
+⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -1,39 +1,49 @@
-base_model: EleutherAI/gpt-neox-20b
+base_model: EleutherAI/pythia-12b-deduped
+base_model_config: EleutherAI/pythia-12b-deduped
 base_model_ignore_patterns: pytorch*  # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
-load_in_8bit: true
+load_in_8bit: false
+load_in_4bit: false
+gptq: false
+device_map: auto
 datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
+  - path: vicgalle/alpaca-gpt4
    type: alpaca
-    shards: 4
-    shards_index: 0
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
-adapter: lora
+adapter:
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
-lora_r: 8
+lora_r: 64
 lora_alpha: 32
-lora_dropout: 0.05
+lora_dropout: 0.0
 lora_target_modules:
-  - query_key_value
+lora_target_linear: true
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: gpt4all-neox-20b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./gpt4all-neox-20b
+output_dir: ./pythia-12b
 gradient_accumulation_steps: 1
-micro_batch_size: 4
+micro_batch_size: 1
 num_epochs: 5
 learning_rate: 0.00003
-lr_scheduler: one_cycle
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
 train_on_inputs: false
 group_by_length: false
-bf16: True
-tf32: True
+bf16: false
+fp16: false
+float16: true
+tf32: true
+flash_optimum: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
+gradient_checkpointing: true
+fsdp:
+fsdp_config:
+collator_pad_to_longest: true
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -1,36 +1,29 @@
 base_model: EleutherAI/pythia-1.4b-deduped
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
+base_model_config: EleutherAI/pythia-1.4b-deduped
 load_in_8bit: true
 datasets:
-  - path: data/alpaca_data_gpt4.jsonl
+  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
-sequence_len: 2048
-lora_r: 8
+sequence_len: 512
+lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - query_key_value
-#  - xxx
+lora_target_linear:
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: pythia-1.4b-lora
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./lora-alpaca
+output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
 micro_batch_size: 4
-num_epochs: 5
+num_epochs: 3
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
@@ -39,3 +32,6 @@ tf32: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
+weight_decay: 0.1
+eval_steps: 20
+logging_steps: 1
--- a/examples/qlora-openllama-3b/README.md
+++ b/examples/qlora-openllama-3b/README.md
@@ -1,6 +0,0 @@
-# qlora-openllama-3b
-
-```shell
-accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
-
-```
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -1,7 +1,7 @@
 base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
 base_model_config: togethercomputer/RedPajama-INCITE-Chat-3B-v1
 model_type: GPTNeoXForCausalLM
-tokenizer_type: GPTNeoXTokenizer
+tokenizer_type: AutoTokenizer
 trust_remote_code:
 load_in_8bit: false
 datasets:
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -0,0 +1,90 @@
+# An example of finetuning Salesforce's XGen-7b model with 8k context using qlora
+# on Tim Dettmers' Guanaco dataset.
+base_model: Salesforce/xgen-7b-8k-base
+base_model_config: Salesforce/xgen-7b-8k-base
+trust_remote_code: true
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+load_in_8bit: false
+# enable 4bit for QLoRA
+load_in_4bit: true
+gptq: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: timdettmers/openassistant-guanaco
+    data_files:
+      - openassistant_best_replies_train.jsonl
+    type: "completion"
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+# enable QLoRA
+adapter: qlora
+lora_model_dir:
+sequence_len: 8192
+max_packed_sequence_len:
+
+# hyperparameters from QLoRA paper Appendix B.2
+# "We find hyperparameters to be largely robust across datasets"
+lora_r: 64
+lora_alpha: 16
+# 0.1 for models up to 13B
+# 0.05 for 33B and 65B models
+lora_dropout: 0.05
+# add LoRA modules on all linear layers of the base model
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+
+# QLoRA paper Table 9
+# - 16 for 7b & 13b
+# - 32 for 33b, 64 for 65b
+# Max size tested on A6000
+# - 7b: 40
+# - 40b: 4
+# decrease if OOM, increase for max VRAM utilization
+micro_batch_size: 1
+gradient_accumulation_steps: 1
+num_epochs: 3
+# Optimizer for QLoRA
+optimizer: paged_adamw_32bit
+torchdistx_path:
+lr_scheduler: cosine
+# QLoRA paper Table 9
+# - 2e-4 for 7b & 13b
+# - 1e-4 for 33b & 65b
+learning_rate: 0.00002
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+gradient_checkpointing: true
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps: 50
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+  eos_token: "<|endoftext|>"
+  bos_token: "<|endoftext|>"
+  unk_token: "<|endoftext|>"
+  pad_token: "<|endoftext|>"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
-accelerate
+accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
 addict
 fire
 PyYAML==6.0
@@ -11,6 +11,8 @@ sentencepiece
 wandb
 einops
 xformers
+optimum
+hf_transfer
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
--- a/scripts/alpaca_json_to_jsonl.py
+++ b/scripts/alpaca_json_to_jsonl.py
@@ -15,6 +15,9 @@ from axolotl.convert import (
    JsonToJsonlConverter,
    StdoutWriter,
 )
+from axolotl.logging_config import configure_logging
+
+configure_logging()

 # add src to the pythonpath so we don't need to pip install this
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -12,13 +12,15 @@ from typing import Any, Dict, List, Optional, Union
 import fire
 import torch
 import yaml
-from transformers import GenerationConfig
-
-from axolotl.utils.data import load_prepare_datasets
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_model, load_tokenizer

 # add src to the pythonpath so we don't need to pip install this
+from optimum.bettertransformer import BetterTransformer
+from transformers import GenerationConfig, TextStreamer
+
+from axolotl.logging_config import configure_logging
+from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.validation import validate_config
@@ -28,9 +30,12 @@ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

+configure_logging()
+LOG = logging.getLogger("axolotl.scripts")
+

-logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


 def choose_device(cfg):
@@ -47,10 +52,11 @@ def choose_device(cfg):
            return "cpu"

    cfg.device = get_device()
-    if cfg.device == "cuda":
-        cfg.device_map = {"": cfg.local_rank}
-    else:
-        cfg.device_map = {"": cfg.device}
+    if cfg.device_map != "auto":
+        if cfg.device.startswith("cuda"):
+            cfg.device_map = {"": cfg.local_rank}
+        else:
+            cfg.device_map = {"": cfg.device}


 def get_multi_line_input() -> Optional[str]:
@@ -62,23 +68,43 @@ def get_multi_line_input() -> Optional[str]:
    return instruction


-def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
-    tokenizer.add_special_tokens({"unk_token": "<unk>"})
-    tokenizer.add_special_tokens({"bos_token": "<s>"})
-    tokenizer.add_special_tokens({"eos_token": "</s>"})
+def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
+    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

-    prompter_module = getattr(importlib.import_module("axolotl.prompters"), prompter)
+    for token, symbol in default_tokens.items():
+        # If the token isn't already specified in the config, add it
+        if not (cfg.special_tokens and token in cfg.special_tokens):
+            tokenizer.add_special_tokens({token: symbol})
+
+    prompter_module = None
+    if prompter:
+        prompter_module = getattr(
+            importlib.import_module("axolotl.prompters"), prompter
+        )
+
+    if cfg.landmark_attention:
+        from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id
+
+        set_model_mem_id(model, tokenizer)
+        model.set_mem_cache_args(
+            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
+        )

    while True:
+        print("=" * 80)
        # support for multiline inputs
        instruction = get_multi_line_input()
        if not instruction:
            return
-        prompt: str = next(
-            prompter_module().build_prompt(instruction=instruction.strip("\n"))
-        )
+        if prompter_module:
+            prompt: str = next(
+                prompter_module().build_prompt(instruction=instruction.strip("\n"))
+            )
+        else:
+            prompt = instruction.strip()
        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

+        print("=" * 40)
        model.eval()
        with torch.no_grad():
            generation_config = GenerationConfig(
@@ -97,10 +123,13 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
                output_hidden_states=False,
                output_scores=False,
            )
+            streamer = TextStreamer(tokenizer)
            generated = model.generate(
                inputs=batch["input_ids"].to(cfg.device),
                generation_config=generation_config,
+                streamer=streamer,
            )
+        print("=" * 40)
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))


@@ -150,7 +179,7 @@ def train(
    cfg_keys = cfg.keys()
    for k, _ in kwargs.items():
        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or cfg.strict is False:
+        if k in cfg_keys or not cfg.strict:
            # handle booleans
            if isinstance(cfg[k], bool):
                cfg[k] = bool(kwargs[k])
@@ -182,20 +211,34 @@ def train(
            cfg.fp16 = True
        cfg.bf16 = False

+    if cfg.tf32:
+        torch.backends.cuda.matmul.allow_tf32 = True
+
    # load the tokenizer first
    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    logging.info(f"loading tokenizer... {tokenizer_config}")
+    LOG.info(f"loading tokenizer... {tokenizer_config}")
    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)

-    if check_not_in(
-        ["inference", "shard", "merge_lora"], kwargs
+    if (
+        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
    ):  # don't need to load dataset for these
-        train_dataset, eval_dataset = load_prepare_datasets(
-            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
-        )
+        if not cfg.pretraining_dataset:
+            train_dataset, eval_dataset = load_prepare_datasets(
+                tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
+            )
+        else:
+            train_dataset = load_pretraining_dataset(
+                cfg.pretraining_dataset,
+                tokenizer,
+                max_tokens=cfg.sequence_len,
+                seed=cfg.seed,
+            )
+            # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
+            train_dataset = train_dataset.with_format("torch")
+            eval_dataset = None

    if cfg.debug or "debug" in kwargs:
-        logging.info("check_dataset_labels...")
+        LOG.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
@@ -204,11 +247,11 @@ def train(
        )

    if prepare_ds_only:
-        logging.info("Finished preparing dataset. Exiting...")
+        LOG.info("Finished preparing dataset. Exiting...")
        return

    # Load the model and tokenizer
-    logging.info("loading model and peft_config...")
+    LOG.info("loading model and peft_config...")
    model, peft_config = load_model(
        cfg.base_model,
        cfg.base_model_config,
@@ -216,22 +259,27 @@ def train(
        tokenizer,
        cfg,
        adapter=cfg.adapter,
-        inference=("inference" in kwargs),
    )

    if "merge_lora" in kwargs and cfg.adapter is not None:
-        logging.info("running merge of LoRA with base model")
+        LOG.info("running merge of LoRA with base model")
        model = model.merge_and_unload()
        model.to(dtype=torch.float16)

        if cfg.local_rank == 0:
-            logging.info("saving merged model")
+            LOG.info("saving merged model")
            model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

-    if "inference" in kwargs:
-        logging.info("calling do_inference function")
-        do_inference(cfg, model, tokenizer)
+    if cfg.inference:
+        LOG.info("calling do_inference function")
+        prompter: Optional[str] = "AlpacaPrompter"
+        if "prompter" in kwargs:
+            if kwargs["prompter"] == "None":
+                prompter = None
+            else:
+                prompter = kwargs["prompter"]
+        do_inference(cfg, model, tokenizer, prompter=prompter)
        return

    if "shard" in kwargs:
@@ -243,27 +291,30 @@ def train(
    model.config.use_cache = False

    if torch.__version__ >= "2" and sys.platform != "win32":
-        logging.info("Compiling torch model")
+        LOG.info("Compiling torch model")
        model = torch.compile(model)

    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
-        logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
        peft_config.save_pretrained(cfg.output_dir)

    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
    if cfg.local_rank == 0:
+
+        def terminate_handler(_, __, model):
+            if cfg.flash_optimum:
+                model = BetterTransformer.reverse(model)
+            model.save_pretrained(cfg.output_dir)
+            sys.exit(0)
+
        signal.signal(
-            signal.SIGINT,
-            lambda signal, frame: (
-                model.save_pretrained(cfg.output_dir),
-                sys.exit(0),
-            ),
+            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
        )

-    logging.info("Starting trainer...")
+    LOG.info("Starting trainer...")
    if cfg.group_by_length:
-        logging.info("hang tight... sorting dataset for group_by_length")
+        LOG.info("hang tight... sorting dataset for group_by_length")
    resume_from_checkpoint = cfg.resume_from_checkpoint
    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
        possible_checkpoints = [
@@ -275,16 +326,29 @@ def train(
                key=lambda path: int(path.split("-")[-1]),
            )
            resume_from_checkpoint = sorted_paths[-1]
-            logging.info(
+            LOG.info(
                f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
            )
-    trainer.train(resume_from_checkpoint=resume_from_checkpoint)

-    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+    if not Path(cfg.output_dir).is_dir():
+        os.makedirs(cfg.output_dir, exist_ok=True)
+    if cfg.flash_optimum:
+        with torch.backends.cuda.sdp_kernel(
+            enable_flash=True, enable_math=True, enable_mem_efficient=True
+        ):
+            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    else:
+        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
-    if cfg.local_rank == 0:
+    if cfg.fsdp:
+        model.save_pretrained(cfg.output_dir)
+    elif cfg.local_rank == 0:
+        if cfg.flash_optimum:
+            model = BetterTransformer.reverse(model)
        model.save_pretrained(cfg.output_dir)

    # trainer.save_model(cfg.output_dir)  # TODO this may be needed for deepspeed to work? need to review another time
--- a/scripts/runpod-entrypoint.sh
+++ b/scripts/runpod-entrypoint.sh
@@ -1,10 +1,21 @@
 #!/bin/bash

-echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
-chmod 700 -R ~/.ssh
+# Export specific ENV variables to /etc/rp_environment
+echo "Exporting environment variables..."
+printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
+echo 'source /etc/rp_environment' >> ~/.bashrc

-# Start the SSH service in the background
-service ssh start
+if [[ $PUBLIC_KEY ]]
+then
+    mkdir -p ~/.ssh
+    chmod 700 ~/.ssh
+    echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
+    chmod 700 -R ~/.ssh
+    # Start the SSH service in the background
+    service ssh start
+else
+    echo "No PUBLIC_KEY ENV variable provided, not starting openSSH daemon"
+fi

 # Execute the passed arguments (CMD)
 exec "$@"
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,12 +1,13 @@
 """Module containing Dataset functionality"""

 import logging
+import os
 from typing import List

 import torch
 from datasets import IterableDataset

-from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy
+from .prompt_tokenizers import PromptTokenizingStrategy

 # We want this to be a wrapper for an existing dataset that we have loaded
 # lets use the concept of middlewares to wrap each dataset, for example
@@ -14,6 +15,8 @@ from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets

+LOG = logging.getLogger("axolotl")
+

 class TokenizedPromptDataset(IterableDataset):
    """
@@ -32,13 +35,15 @@ class TokenizedPromptDataset(IterableDataset):
        self.dataset = dataset

    def __iter__(self):
-        iterator = iter(self.dataset)
-        # Loop through the entire dataset
-        for example in iterator:
-            try:
-                yield self.prompt_tokenizer.tokenize_prompt(example)
-            except InvalidDataException:
-                pass
+        features = self.dataset.features.keys()
+        num_proc = os.cpu_count()
+        return iter(
+            self.dataset.map(
+                self.prompt_tokenizer.tokenize_prompt,
+                num_proc=num_proc,
+                remove_columns=features,
+            )
+        )


 # TODO this isn't the best since it can't interleave datasets
@@ -111,7 +116,7 @@ class ConstantLengthDataset(IterableDataset):
                                "attention_mask": attention_mask,
                            }
                        else:
-                            logging.warning(
+                            LOG.warning(
                                f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                            )
                    buffer = {
@@ -122,6 +127,7 @@ class ConstantLengthDataset(IterableDataset):
                    buffer_len = 0

                if example:
+                    # FIXME
                    # just going to drop data points that are too long
                    if len(example["input_ids"]) <= self.seq_length:
                        input_ids = example["input_ids"]
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -0,0 +1,33 @@
+"""Logging configuration settings"""
+
+import os
+import sys
+from logging.config import dictConfig
+from typing import Any, Dict
+
+DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
+    "version": 1,
+    "formatters": {
+        "simple": {
+            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
+        },
+    },
+    "filters": {},
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "simple",
+            "filters": [],
+            "stream": sys.stdout,
+        },
+    },
+    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
+    "loggers": {
+        "axolotl": {"handlers": ["console"], "level": "DEBUG", "propagate": False},
+    },
+}
+
+
+def configure_logging():
+    """Configure with default logging"""
+    dictConfig(DEFAULT_LOGGING_CONFIG)
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -8,7 +8,7 @@ import torch
 import transformers
 from einops import rearrange
 from flash_attn.bert_padding import pad_input, unpad_input
-from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb


@@ -79,7 +79,7 @@ def forward(
            dtype=torch.int32,
            device=qkv.device,
        )
-        output = flash_attn_unpadded_qkvpacked_func(
+        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
@@ -95,7 +95,7 @@ def forward(
            three=3,
            h=nheads,
        )
-        output_unpad = flash_attn_unpadded_qkvpacked_func(
+        output_unpad = flash_attn_varlen_qkvpacked_func(
            x_unpad,
            cu_q_lens,
            max_s,
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -7,6 +7,7 @@ import math
 from typing import Optional, Tuple

 import torch
+import torch.nn.functional as F
 import transformers.models.llama.modeling_llama
 from torch import nn

@@ -38,21 +39,48 @@ def xformers_forward(
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

-    query_states = (
-        self.q_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
-    key_states = (
-        self.k_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
-    value_states = (
-        self.v_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
+    if not hasattr(self, "pretraining_tp"):
+        self.pretraining_tp = 1
+
+    if self.pretraining_tp > 1:
+        key_value_slicing = (
+            self.num_key_value_heads * self.head_dim
+        ) // self.pretraining_tp
+        query_slices = self.q_proj.weight.split(
+            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
+        )
+        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+        query_states = [
+            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        query_states = torch.cat(query_states, dim=-1)
+
+        key_states = [
+            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        key_states = torch.cat(key_states, dim=-1)
+
+        value_states = [
+            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        value_states = torch.cat(value_states, dim=-1)
+
+    else:
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(
+        bsz, q_len, self.num_heads, self.head_dim
+    ).transpose(1, 2)
+    key_states = key_states.view(
+        bsz, q_len, self.num_key_value_heads, self.head_dim
+    ).transpose(1, 2)
+    value_states = value_states.view(
+        bsz, q_len, self.num_key_value_heads, self.head_dim
+    ).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
@@ -73,6 +101,14 @@ def xformers_forward(

    past_key_value = (key_states, value_states) if use_cache else None

+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = transformers.models.llama.modeling_llama.repeat_kv(
+        key_states, self.num_key_value_groups
+    )
+    value_states = transformers.models.llama.modeling_llama.repeat_kv(
+        value_states, self.num_key_value_groups
+    )
+
    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
@@ -128,10 +164,23 @@ def xformers_forward(
                f" {attn_output.size()}"
            )

-        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        # end x-formers vs. not x-formers if-else block

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-    attn_output = self.o_proj(attn_output)
+
+    if self.pretraining_tp > 1:
+        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
+        o_proj_slices = self.o_proj.weight.split(
+            self.hidden_size // self.pretraining_tp, dim=1
+        )
+        attn_output = sum(
+            F.linear(attn_output[i], o_proj_slices[i])
+            for i in range(self.pretraining_tp)
+        )
+    else:
+        attn_output = self.o_proj(attn_output)
+
    return attn_output, attn_weights, past_key_value


@@ -184,14 +233,15 @@ def sdp_attention_forward(

    # We only apply sdp attention if we don't need to output the whole attention matrix
    if not output_attentions:
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=False,
-        )
-        attn_weights = None
+        with torch.backends.cuda.sdp_kernel():
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                is_causal=False,
+            )
+            attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
--- a/src/axolotl/monkeypatch/xpos_rope_llama_monkey_patch.py
+++ b/src/axolotl/monkeypatch/xpos_rope_llama_monkey_patch.py
@@ -0,0 +1,94 @@
+# pylint: skip-file
+"""
+Copied from https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
+"""
+import torch
+import transformers
+import transformers.models.llama.modeling_llama
+from einops import rearrange
+
+
+class XposRotaryEmbedding(torch.nn.Module):
+    """
+    Rotary position embedding with an optional xPos scale table.
+
+    Two lookup tables are kept as non-persistent buffers: ``freqs_cached``
+    (rotation angles per position) and ``scale_cached`` (per-position scale
+    factors, or a single 1 when ``use_xpos`` is False). ``forward`` rebuilds both
+    when it is asked for a sequence longer than the current cache.
+    """
+
+    def __init__(
+        self,
+        dim,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scale_base=2048,
+        use_xpos=True,
+    ):
+        """
+        Args:
+            dim: embedding size used to derive the inverse frequencies.
+            max_position_embeddings: number of positions to precompute.
+            base: base of the inverse-frequency progression.
+            device: device passed to ``torch.arange`` for the position index.
+            scale_base: divisor applied to the centred position when computing
+                the scale exponent.
+            use_xpos: when False, no scale table is built and ``scale`` is None.
+        """
+        super().__init__()
+        self.max_seq_len_cached = max_position_embeddings
+        self.scale_base = scale_base
+
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
+        t = torch.arange(self.max_seq_len_cached, device=device).type_as(inv_freq)
+        freqs = torch.einsum("i , j -> i j", t, inv_freq)
+        freqs = torch.cat((freqs, freqs), dim=-1)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.register_buffer("freqs_cached", freqs, persistent=False)
+
+        if not use_xpos:
+            self.register_buffer("scale", None)
+            # Keep the placeholder out of state_dict, like every other cache here.
+            self.register_buffer("scale_cached", torch.ones(1), persistent=False)
+            return
+
+        scale = (torch.arange(0, dim, 2) + 0.4 * dim) / (1.4 * dim)
+        # Exponent is the position centred on the middle of the cached range.
+        power = (t - (self.max_seq_len_cached // 2)) / self.scale_base
+        scale_cached = scale ** rearrange(power, "n -> n 1")
+        scale_cached = torch.cat((scale_cached, scale_cached), dim=-1)
+
+        self.register_buffer("scale", scale, persistent=False)
+        self.register_buffer("scale_cached", scale_cached, persistent=False)
+
+    def forward(
+        self,
+        x,
+        seq_len,
+    ):
+        """
+        Return ``(freqs_cached, scale_cached)`` in ``x``'s dtype.
+
+        The complete cached tables are returned (not a ``seq_len``-long slice);
+        they are regenerated first when ``seq_len`` exceeds the cached length.
+        """
+        if seq_len > self.max_seq_len_cached:
+            self.max_seq_len_cached = seq_len
+            t = torch.arange(self.max_seq_len_cached, device=x.device).type_as(
+                self.inv_freq
+            )
+            freqs = torch.einsum("i , j -> i j", t, self.inv_freq)
+            freqs = torch.cat((freqs, freqs), dim=-1).to(dtype=x.dtype)
+
+            # Re-registering without persistent=False would silently turn the
+            # cache into a persistent buffer and add it to state_dict.
+            self.register_buffer("freqs_cached", freqs, persistent=False)
+
+            if self.scale is None:
+                self.register_buffer(
+                    "scale_cached",
+                    torch.ones(1, device=x.device).to(dtype=x.dtype),
+                    persistent=False,
+                )
+
+                return self.freqs_cached.to(dtype=x.dtype), self.scale_cached
+
+            power = (t - (seq_len // 2)) / self.scale_base
+            scale = self.scale ** rearrange(power, "n -> n 1")
+            scale = torch.cat((scale, scale), dim=-1).to(dtype=x.dtype)
+            self.register_buffer("scale_cached", scale, persistent=False)
+
+        return self.freqs_cached.to(dtype=x.dtype), self.scale_cached.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Split the last dimension in two halves and return ``cat(-second, first)``."""
+    front, back = x.chunk(2, dim=-1)
+    swapped = (-back, front)
+    return torch.cat(swapped, dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, freqs, scale=1, position_ids=None):
+    """
+    Rotate ``q`` and ``k`` by the angles in ``freqs``, with optional scaling.
+
+    ``freqs`` is indexed by ``position_ids``. ``q`` is multiplied by ``scale`` and
+    ``k`` is divided by it. ``scale`` may be a plain number (the default ``1``
+    leaves both unscaled) or a tensor; a tensor whose last dimension is not 1 is
+    treated as a per-position table and indexed by ``position_ids`` as well.
+
+    Returns:
+        Tuple ``(q_embed, k_embed)``.
+    """
+    freqs = freqs[position_ids, :]
+    # The default scale is an int and has no ``.shape``; only tensors can be a
+    # per-position table that needs gathering.
+    if isinstance(scale, torch.Tensor) and scale.shape[-1] != 1:
+        scale = scale[position_ids, :]
+
+    q_embed = (q * freqs.cos() * scale) + (rotate_half(q) * freqs.sin() * scale)
+    k_embed = (k * freqs.cos() * 1 / scale) + (rotate_half(k) * freqs.sin() * 1 / scale)
+
+    return q_embed, k_embed
+
+
+def replace_llama_rope_with_xpos_rope():
+    """
+    Monkeypatch ``transformers.models.llama.modeling_llama`` so that its
+    ``LlamaRotaryEmbedding`` and ``apply_rotary_pos_emb`` names point at the
+    xPos implementations defined in this module.
+    """
+    llama_module = transformers.models.llama.modeling_llama
+    llama_module.LlamaRotaryEmbedding = XposRotaryEmbedding
+    llama_module.apply_rotary_pos_emb = apply_rotary_pos_emb
--- a/src/axolotl/prompt_strategies/alpaca_chat.py
+++ b/src/axolotl/prompt_strategies/alpaca_chat.py
@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    InstructionPromptTokenizingStrategy,
 )
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -18,6 +18,42 @@ def load(tokenizer, cfg):
    )


+class AlpacaConcisePrompter(AlpacaPrompter):
+    """
+    Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers
+    """
+
+    # Overrides of the AlpacaPrompter system prompts: the first is worded for
+    # rows that pair the instruction with an input, the second for rows without.
+    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
+    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"
+
+
+class AlpacaChatPrompter(AlpacaPrompter):
+    """
+    Alpaca Chat Prompter extending the system prompt for chat-instruct answers
+    """
+
+    # Same USER/ASSISTANT system prompts as AlpacaConcisePrompter.
+    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
+    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"
+
+    def __init__(self):  # pylint: disable=super-init-not-called
+        # Takes no prompt_style argument: the style is pinned to CHAT and the
+        # turn formats are then selected via match_prompt_style().
+        self.prompt_style = PromptStyle.CHAT.value
+        self.match_prompt_style()
+
+
+class NoSystemPrompter(AlpacaPrompter):
+    """
+    Null Prompter with no system prompts
+    """
+
+    # Empty system prompts; each turn is just the raw instruction (and input)
+    # followed by a single space.
+    system_prompt = ""
+    system_no_input_prompt = ""
+    turn_format = "{instruction} {input} "
+    turn_no_input_format = "{instruction} "
+
+    def __init__(self):  # pylint: disable=super-init-not-called
+        # Intentionally does not call the parent initializer; the turn formats
+        # come from the class attributes above.
+        pass
+
+
 class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for AlpacaQA
@@ -31,9 +67,49 @@ class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
        )


-def load_qa(tokenizer, cfg):
-    return AlpacaQAPromptTokenizingStrategy(
-        AlpacaPrompter(PromptStyle.CHAT.value),
+class CamelAIPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
+    """
+    Tokenizing strategy for CamelAI datasets
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """Map a row to (instruction, input, response): message_1, "", message_2."""
+        instruction = prompt["message_1"]
+        response = prompt["message_2"]
+        return instruction, "", response
+
+
+def load_concise(tokenizer, cfg):
+    """
+    Build an AlpacaPromptTokenizingStrategy around a chat-style
+    AlpacaConcisePrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = AlpacaConcisePrompter(PromptStyle.CHAT.value)
+    return AlpacaPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_qa(tokenizer, cfg):
+    """
+    Build an AlpacaQAPromptTokenizingStrategy around an AlpacaChatPrompter,
+    using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = AlpacaChatPrompter()
+    return AlpacaQAPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_camel_ai(tokenizer, cfg):
+    """
+    Build a CamelAIPromptTokenizingStrategy around an AlpacaChatPrompter,
+    using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = AlpacaChatPrompter()
+    return CamelAIPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_no_prompt(tokenizer, cfg):
+    return AlpacaPromptTokenizingStrategy(
+        UnpromptedPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
--- a/src/axolotl/prompt_strategies/alpaca_instruct.py
+++ b/src/axolotl/prompt_strategies/alpaca_instruct.py
@@ -1,7 +1,7 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


 def load(tokenizer, cfg):
@@ -11,3 +11,12 @@ def load(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
+
+
+def load_no_prompt(tokenizer, cfg):
+    """
+    Build an AlpacaPromptTokenizingStrategy around an instruct-style
+    UnpromptedPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = UnpromptedPrompter(PromptStyle.INSTRUCT.value)
+    return AlpacaPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -0,0 +1,139 @@
+"""
+Prompt strategies loader for alpaca instruction datasets with system prompts
+"""
+from typing import Generator, Tuple, Union
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for instruction prompts whose rows also carry a system prompt.
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        """Return (instruction, input, output, system); a missing input becomes ""."""
+        instruction = prompt["instruction"]
+        input_text = prompt["input"] if "input" in prompt else ""
+        response = prompt["output"]
+        system = prompt["system"]
+        return instruction, input_text, response, system
+
+    def tokenize_prompt(self, prompt):
+        """
+        Tokenize the user-facing prompt and the response separately and join them.
+
+        When train_on_inputs is falsy, the labels of the prompt part are replaced
+        with -100; the response tokens are then appended to input_ids,
+        attention_mask and labels.
+        """
+        # pylint: disable=duplicate-code
+        instruction, input_text, response, system = self.parse_instruction_fields(
+            prompt
+        )
+        prompt_parts = self.prompter.build_prompt_w_system(
+            system, instruction, input_text
+        )
+        user_prompt = next(iter(prompt_parts))
+        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
+        if not self.train_on_inputs:
+            # TODO this could be sped up using numpy array slicing
+            masked_len = len(tokenized_prompt["input_ids"])
+            tokenized_prompt["labels"] = [-100] * masked_len
+        tokenized_res_prompt = self._tokenize(
+            response, strip_bos_token=True, add_eos_token=True
+        )
+        # (target key, source key): the response labels are its own input ids.
+        for target, source in (
+            ("input_ids", "input_ids"),
+            ("attention_mask", "attention_mask"),
+            ("labels", "input_ids"),
+        ):
+            tokenized_prompt[target] += tokenized_res_prompt[source]
+
+        return tokenized_prompt
+
+
+class SystemDataPrompter(AlpacaPrompter):
+    """
+    Alpaca Style Prompter that uses system prompts from the dataset
+    """
+
+    def build_prompt_w_system(
+        self,
+        system: str,
+        instruction: str,
+        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        output: Union[None, str] = None,
+    ) -> Generator[str, None, None]:
+        """
+        Yield one prompt string: an optional "### System:" header, then the turn
+        rendered from turn_format (when input is truthy) or turn_no_input_format,
+        then the output when one is given.
+        """
+        header = f"### System:\n{system}\n\n" if system else ""
+        if input:
+            turn = self.turn_format.format(instruction=instruction, input=input)
+        else:
+            turn = self.turn_no_input_format.format(instruction=instruction)
+        res = header + turn
+        if output:
+            res = f"{res}{output}"
+        yield res
+
+
+class OpenOrcaSystemDataPrompter(SystemDataPrompter):
+    """
+    Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
+    """
+
+    def match_prompt_style(self):
+        """Set turn_format / turn_no_input_format for the configured prompt style."""
+        # (style value, format with input, format without input)
+        style_table = (
+            (
+                PromptStyle.INSTRUCT.value,
+                "### User:\n{instruction}\n\n### Additional Context:\n{input}\n\n### Assistant:\n",
+                "### User:\n{instruction}\n\n### Assistant:\n",
+            ),
+            (
+                PromptStyle.CHAT.value,
+                "USER: {instruction}\n{input}\nASSISTANT:",
+                "USER: {instruction}\nASSISTANT:",
+            ),
+        )
+        for style, with_input, without_input in style_table:
+            if self.prompt_style == style:
+                self.turn_format = with_input
+                self.turn_no_input_format = without_input
+
+
+class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
+    """
+    Tokenizing strategy for OpenOrca datasets
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        """Return (question, "", response, system_prompt) taken from the row."""
+        question = prompt["question"]
+        answer = prompt["response"]
+        system = prompt["system_prompt"]
+        return question, "", answer, system
+
+
+def load(tokenizer, cfg):
+    """Default loader for this module; delegates to load_chat."""
+    return load_chat(tokenizer, cfg)
+
+
+def load_instruct(tokenizer, cfg):
+    """
+    Build an InstructionWSystemPromptTokenizingStrategy around an instruct-style
+    SystemDataPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = SystemDataPrompter(PromptStyle.INSTRUCT.value)
+    return InstructionWSystemPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_chat(tokenizer, cfg):
+    """
+    Build an InstructionWSystemPromptTokenizingStrategy around a chat-style
+    SystemDataPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = SystemDataPrompter(PromptStyle.CHAT.value)
+    return InstructionWSystemPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_open_orca(tokenizer, cfg):
+    """
+    Build an OpenOrcaPromptTokenizingStrategy around an instruct-style
+    OpenOrcaSystemDataPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = OpenOrcaSystemDataPrompter(PromptStyle.INSTRUCT.value)
+    return OpenOrcaPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
--- a/src/axolotl/prompt_strategies/context_qa.py
+++ b/src/axolotl/prompt_strategies/context_qa.py
@@ -0,0 +1,67 @@
+"""Module containing the classes for Context QA Prompt Tokenization Strategies"""
+from typing import Tuple
+
+from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+# article, unanswerable_question, question, answer
+def load_404(tokenizer, cfg):
+    """
+    Build an AlpacaMissingInfoContextPromptTokenizingStrategy around a chat-style
+    AlpacaContextPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
+    return AlpacaMissingInfoContextPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load(tokenizer, cfg):
+    """
+    Build an AlpacaContextPromptTokenizingStrategy around a chat-style
+    AlpacaContextPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
+    return AlpacaContextPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+class AlpacaContextPrompter(AlpacaPrompter):
+    """
+    Customized system prompter for concise QA
+    """
+
+    # The same instruction is used whether or not the row has an input.
+    system_prompt = (
+        "Use the following contextual information to concisely answer the question.\n"
+    )
+    system_no_input_prompt = (
+        "Use the following contextual information to concisely answer the question.\n"
+    )
+
+
+class AlpacaContextPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
+    """
+    Tokenization Strategy to combine in-context article with a question and answer
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """Return (article + separator + question, "", answer) for the row."""
+        separator = "\n===\n"
+        instruction = prompt["article"] + separator + prompt["question"]
+        answer = prompt["answer"]
+        return instruction, "", answer
+
+
+class AlpacaMissingInfoContextPromptTokenizingStrategy(
+    InstructionPromptTokenizingStrategy
+):
+    """
+    Tokenization Strategy to combine in-context article with a question that can't be answered
+    from the context and a default response to that effect
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+        """Return (article + separator + unanswerable question, "", fixed refusal)."""
+        separator = "\n===\n"
+        instruction = prompt["article"] + separator + prompt["unanswerable_question"]
+        # Canned response used for every row, regardless of its contents.
+        refusal = (
+            "The context provided does not contain any information about your inquiry. "
+            "Therefore, I'm unable to answer your question based on the given context."
+        )
+        return instruction, "", refusal
--- a/src/axolotl/prompt_strategies/pygmalion.py
+++ b/src/axolotl/prompt_strategies/pygmalion.py
@@ -11,6 +11,8 @@ from axolotl.prompt_tokenizers import (
    tokenize_prompt_default,
 )

+LOG = logging.getLogger("axolotl")
+
 IGNORE_TOKEN_ID = -100


@@ -64,7 +66,7 @@ class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
                    *copy.deepcopy(res["input_ids"])
                ][len(self.bot_prefix_token_ids) :]
            else:
-                logging.warning(f"unknown role in conversation: {role}")
+                LOG.warning(f"unknown role in conversation: {role}")
                res = defaultdict(lambda: [])

            # pylint: disable=duplicate-code
--- a/src/axolotl/prompt_strategies/sharegpt_jokes.py
+++ b/src/axolotl/prompt_strategies/sharegpt_jokes.py
@@ -0,0 +1,28 @@
+"""Module for Jokes prompts using sharegpt style """
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import PromptStyle, ShareGPTPrompter
+
+
+def load(tokenizer, cfg):
+    """
+    Build a SimpleJokesShareGPTPromptTokenizingStrategy around a chat-style
+    ShareGPTPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = ShareGPTPrompter(PromptStyle.CHAT.value)
+    return SimpleJokesShareGPTPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+class SimpleJokesShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    Tokenization strategy for asking bot to tell a joke and then explain why it's funny
+    """
+
+    # title, text, explanation
+    def get_conversation_thread(self, prompt):
+        """
+        Turn a row into a fixed four-turn thread: joke request, the joke
+        (title, when truthy, prefixed to the text), a follow-up question, and
+        the explanation.
+        """
+        if not prompt["title"]:
+            title = ""
+        else:
+            title = prompt["title"] + " "
+        joke_turn = {"from": "gpt", "value": title + prompt["text"]}
+        explanation_turn = {"from": "gpt", "value": prompt["explanation"]}
+        return [
+            {"from": "human", "value": "Tell me a joke."},
+            joke_turn,
+            {"from": "human", "value": "Why is that joke funny?"},
+            explanation_turn,
+        ]
--- a/src/axolotl/prompt_strategies/sharegpt_simple.py
+++ b/src/axolotl/prompt_strategies/sharegpt_simple.py
@@ -0,0 +1,67 @@
+"""Module containing the SimpleShareGPTPromptTokenizingStrategy class"""
+
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import PromptStyle, ShareGPTPrompter
+
+
+def load(tokenizer, cfg):
+    """
+    Build a SimpleShareGPTPromptTokenizingStrategy around a chat-style
+    ShareGPTPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = ShareGPTPrompter(PromptStyle.CHAT.value)
+    return SimpleShareGPTPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_role(tokenizer, cfg):
+    """
+    Build a SimpleRoleShareGPTPromptTokenizingStrategy around a chat-style
+    ShareGPTPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = ShareGPTPrompter(PromptStyle.CHAT.value)
+    return SimpleRoleShareGPTPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+def load_guanaco(tokenizer, cfg):
+    """
+    Build a GuanacoShareGPTPromptTokenizingStrategy around a chat-style
+    ShareGPTPrompter, using cfg.train_on_inputs and cfg.sequence_len.
+    """
+    prompter = ShareGPTPrompter(PromptStyle.CHAT.value)
+    return GuanacoShareGPTPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row
+    """
+
+    def get_conversation_thread(self, prompt):
+        # The row's "conversations" value is returned as-is, with no remapping.
+        return prompt["conversations"]
+
+
+class SimpleRoleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row, but uses role instead of from
+    """
+
+    def get_conversation_thread(self, prompt):
+        """Rename each turn's "role" key to "from"; "value" is carried over unchanged."""
+        turns = []
+        for turn in prompt["conversations"]:
+            # role: ..., value: ... => from: ..., value: ...
+            turns.append({"from": turn["role"], "value": turn["value"]})
+        return turns
+
+
+class GuanacoShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    sharegpt strategy that remaps oasst data to sharegpt format
+    """
+
+    def get_conversation_thread(self, prompt):
+        """
+        Convert turns of the form role: prompter/assistant, text: ... into
+        from: human/gpt, value: ...; any other role raises KeyError.
+        """
+        speaker_for_role = {"prompter": "human", "assistant": "gpt"}
+        turns = []
+        for turn in prompt["conversations"]:
+            speaker = speaker_for_role[turn["role"]]
+            turns.append({"from": speaker, "value": turn["text"]})
+        return turns
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -10,6 +10,8 @@ from transformers import PreTrainedTokenizer

 from axolotl.prompters import IGNORE_TOKEN_ID

+LOG = logging.getLogger("axolotl")
+
 IGNORE_INDEX = -100
 LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
 LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
@@ -46,16 +48,22 @@ class PromptTokenizingStrategy(abc.ABC):

    @functools.lru_cache(maxsize=128)
    def _get_user_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
        return False

    @functools.lru_cache(maxsize=128)
    def _get_assistant_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
        return False

    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
@@ -87,7 +95,9 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
    Tokenizing strategy for instruction-based prompts.
    """

-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
+    def parse_instruction_fields(
+        self, prompt
+    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
@@ -96,25 +106,27 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
            input,  # pylint: disable=redefined-builtin
            response,
        ) = self.parse_instruction_fields(prompt)
-        full_prompt = self._build_full_prompt(instruction, input, response)
-        tokenized_full_prompt = self._tokenize(full_prompt)
-        if not self.train_on_inputs:
-            user_prompt = next(
-                iter(
-                    self.prompter.build_prompt(
-                        instruction,
-                        input,
-                    )
+        user_prompt = next(
+            iter(
+                self.prompter.build_prompt(
+                    instruction,
+                    input,
                )
            )
-            tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False)
-            user_prompt_len = len(tokenized_user_prompt["input_ids"])
+        )
+        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
+        if not self.train_on_inputs:
+            user_prompt_len = len(tokenized_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
-            tokenized_full_prompt["labels"] = [
-                -100
-            ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]
+            tokenized_prompt["labels"] = [-100] * user_prompt_len
+        tokenized_res_prompt = self._tokenize(
+            response, strip_bos_token=True, add_eos_token=True
+        )
+        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
+        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
+        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]

-        return tokenized_full_prompt
+        return tokenized_prompt

    def _build_full_prompt(
        self, instruction, input, response  # pylint: disable=redefined-builtin
@@ -380,7 +392,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                    else:
-                        logging.warning(f"unhandled role: {part[0]}")
+                        LOG.warning(f"unhandled role: {part[0]}")

                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(
@@ -436,7 +448,7 @@ def parse_tokenized_to_result(
    result: Dict[str, List[int]],
    current_len: int,
    res: Dict[str, List[int]],
-    labels: list[int],
+    labels: List[int],
    pad_token_id: Union[int, None] = None,
 ) -> Tuple[Dict[str, List[int]], int]:
    """
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -5,6 +5,7 @@ import logging
 from enum import Enum, auto
 from typing import Generator, List, Optional, Tuple, Union

+LOG = logging.getLogger("axolotl")
 IGNORE_TOKEN_ID = -100


@@ -24,6 +25,8 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
+    turn_format: str
+    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None

    def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
@@ -32,23 +35,13 @@ class AlpacaPrompter:

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.prompt_input = (
-                self.system_prompt
-                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
+            self.turn_no_input_format = (
+                "### Instruction:\n{instruction}\n\n### Response:\n"
            )
-            self.prompt_no_input = (
-                self.system_no_input_prompt
-                + "### Instruction:\n{instruction}\n\n### Response:\n"
-            )
-            self.response_split = "### Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
-            self.prompt_input = (
-                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-            )
-            self.prompt_no_input = (
-                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-            )
-            self.response_split = "ASSISTANT:"
+            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"

    def build_prompt(
        self,
@@ -59,16 +52,17 @@ class AlpacaPrompter:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
-            res = self.prompt_input.format(instruction=instruction, input=input)
+            res = self.system_prompt + self.turn_format.format(
+                instruction=instruction, input=input
+            )
        else:
-            res = self.prompt_no_input.format(instruction=instruction)
+            res = self.system_no_input_prompt + self.turn_no_input_format.format(
+                instruction=instruction
+            )
        if output:
            res = f"{res}{output}"
        yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class UnpromptedPrompter(AlpacaPrompter):
    """
@@ -93,7 +87,10 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """

    system_prompt = (
-        "Choose the answer that best answers the question. Explain your reasoning."
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
+    )
+    system_no_input_prompt = (
+        "Choose the answer that best answers the question. Explain your reasoning.\n"
    )


@@ -102,7 +99,12 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
    Prompter for multiple choice concise
    """

-    prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"
+    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"


 class SummarizeTLDRPrompter(AlpacaPrompter):
@@ -110,9 +112,12 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
    Prompter for summarize TLDR
    """

-    prompt_no_input = (
-        "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
-    )
+    system_prompt = ""
+    system_no_input_prompt = ""
+
+    def match_prompt_style(self):
+        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
+        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"


 class CompletionPrompter:
@@ -128,9 +133,6 @@ class CompletionPrompter:
    ) -> Generator[str, None, None]:
        yield instruction

-    def get_response(self, output: str) -> str:
-        return output.strip()
-

 class GPTeacherPrompter(AlpacaPrompter):
    """
@@ -210,9 +212,6 @@ class ReflectAlpacaPrompter:
            res = f"{res}{label}"
        yield res

-    def get_response(self, output: str) -> str:
-        return output.split(self.response_split)[1].strip()
-

 class SeparatorStyle(Enum):
    """Different separator style."""
@@ -243,7 +242,7 @@ class Conversation:
            if message:
                yield (role + ":", " " + message)
            else:
-                logging.warning(f"role with empty message: {role}")
+                LOG.warning(f"role with empty message: {role}")
                yield (role + ":", "")

    def copy(self):
@@ -261,34 +260,33 @@ class Conversation:
        self.messages.append([role, message])


-conv_vicuna_v1_1 = Conversation(
-    system="A chat between a curious user and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
-    roles=["USER", "ASSISTANT"],
-    messages=[],
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2=" ",
-)
-
-
 class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
    """
    A prompter that generates prompts for the ShareGPT
    """

-    def __init__(self, prompt_style=None):
+    def __init__(self, prompt_style=None, system_prompt: Optional[str] = None):
        if prompt_style != PromptStyle.CHAT.value:
            raise ValueError(
                f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
            )
-
-    # def match_prompt_style(self):
-    #     if self.prompt_style == PromptStyle.chat.value:
-    #         self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
-    #         self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
-    #         self.response_split = "ASSISTANT:"
+        system: str = (
+            system_prompt
+            if system_prompt
+            else (
+                "A chat between a curious user and an artificial intelligence assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions."
+            )
+        )
+        self._conversation = Conversation(
+            system=system,
+            roles=["USER", "ASSISTANT"],
+            messages=[],
+            offset=0,
+            sep_style=SeparatorStyle.TWO,
+            sep=" ",
+            sep2=" ",
+        )

    def build_prompt(self, source) -> Generator[str, None, None]:
        # ignore the system prompt if provided
@@ -300,7 +298,7 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
            # also happens on the data splitting leaving empty conversations
            raise IndexError

-        conv = conv_vicuna_v1_1.copy()
+        conv = self._conversation.copy()
        roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

        try:
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -2,13 +2,14 @@

 import os

+from optimum.bettertransformer import BetterTransformer
 from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
 )
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy


 class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-methods
@@ -30,3 +31,39 @@ class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-
        kwargs["model"].save_pretrained(peft_model_path)

        return control
+
+
+class SaveBetterTransformerModelCallback(
+    TrainerCallback
+):  # pylint: disable=too-few-public-methods
+    """Callback that reverses the BetterTransformer wrapping and saves the resulting model at checkpoint steps"""
+
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # request a save whenever the step-based save strategy lands on a save_steps boundary
+        if (
+            args.save_strategy == IntervalStrategy.STEPS
+            and args.save_steps > 0
+            and state.global_step % args.save_steps == 0
+        ):
+            control.should_save = True
+
+        if control.should_save:
+            checkpoint_folder = os.path.join(
+                args.output_dir,
+                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
+            )
+
+            model = BetterTransformer.reverse(kwargs["model"])  # save the reversed (un-wrapped) model
+            model.save_pretrained(checkpoint_folder)
+            # FIXME - need to cleanup old checkpoints
+
+            # since we're saving here, we don't need the trainer loop to attempt to save too b/c
+            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
+            control.should_save = False
+        return control
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,10 +1,12 @@
 """Module containing data utilities"""
-
+import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
 from typing import List, Tuple, Union

+import torch
 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 from huggingface_hub import hf_hub_download
 from transformers import PreTrainedTokenizerBase
@@ -34,6 +36,8 @@ from axolotl.prompters import (
    SummarizeTLDRPrompter,
 )

+LOG = logging.getLogger("axolotl")
+

 def load_tokenized_prepared_datasets(
    tokenizer, cfg, default_dataset_prepared_path
@@ -72,12 +76,19 @@ def load_tokenized_prepared_datasets(
    if dataset:
        ...
    elif any(prepared_ds_path.glob("*")):
-        logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
        dataset = load_from_disk(str(prepared_ds_path))
-        logging.info("Prepared dataset loaded from disk...")
+        LOG.info("Prepared dataset loaded from disk...")
    else:
-        logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
-        logging.info("Loading raw datasets...")
+        LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
+        LOG.info("Loading raw datasets...")
+
+        if cfg.seed:
+            seed = cfg.seed
+        else:
+            LOG.info("No seed provided, using default seed of 42")
+            seed = 42
+
        datasets = []
        # pylint: disable=invalid-name
        for d in cfg.datasets:
@@ -86,6 +97,7 @@ def load_tokenized_prepared_datasets(
            try:
                load_dataset(
                    d.path,
+                    name=d.name,
                    streaming=True,
                    use_auth_token=use_auth_token,
                )
@@ -94,44 +106,55 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
-            elif ds_from_hub:
-                if d.data_files:
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
                    ds = load_dataset(
                        d.path,
-                        streaming=False,
+                        name=d.name,
                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        name=d.name,
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
                    )
                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory or a file"
                    )
+            elif ds_from_hub:
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
            else:
                fp = hf_hub_download(
                    repo_id=d.path,
                    repo_type="dataset",
                    filename=d.data_files,
                )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
            if not ds:
                raise ValueError("unhandled dataset load")
            # support for using a subset of the data
            if d.shards:
                if "train" in ds:
-                    ds = ds.shuffle(seed=42)["train"].shard(
+                    ds = ds.shuffle(seed=seed)["train"].shard(
                        num_shards=d.shards, index=0
                    )
                else:
-                    ds = ds.shuffle(seed=42).shard(num_shards=d.shards, index=0)
+                    ds = ds.shuffle(seed=seed).shard(num_shards=d.shards, index=0)
            d_type = d.type
            d_type_split = d_type.split(":")
            d_base_type = d_type_split[0]
@@ -232,21 +255,32 @@ def load_tokenized_prepared_datasets(
                ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                datasets.append(ds_wrapper)
            else:
-                logging.error(f"unhandled prompt tokenization strategy: {d.type}")
-                raise ValueError(f"unhandled prompt tokenization strategy: {d.type}")
-        logging.info("tokenizing, merging, and shuffling master dataset")
+                suffix = ""
+                if ":load_" in d.type:
+                    suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
+                LOG.error(f"unhandled prompt tokenization strategy: {d.type}. {suffix}")
+                raise ValueError(
+                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
+                )
+        LOG.info("tokenizing, merging, and shuffling master dataset")

        samples: List[int] = []
+        chunk_size = 1000
        for d in datasets:
-            samples = samples + list(d)
-        dataset = Dataset.from_list(samples).shuffle(seed=42)
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
+        dataset = Dataset.from_list(samples).shuffle(seed=seed)
        if cfg.local_rank == 0:
-            logging.info(
-                f"Saving merged prepared dataset to disk... {prepared_ds_path}"
-            )
+            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
            dataset.save_to_disk(prepared_ds_path)
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
@@ -297,7 +331,7 @@ def load_prepare_datasets(
        use_auth_token = cfg.hf_use_auth_token
        try:
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset = load_dataset(
@@ -311,13 +345,13 @@ def load_prepare_datasets(
        if dataset:
            ...
        elif any(prepared_ds_path.glob("*")):
-            logging.info(
+            LOG.info(
                f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
            )
            dataset = load_from_disk(str(prepared_ds_path))
-            logging.info("Prepared packed dataset loaded from disk...")
+            LOG.info("Prepared packed dataset loaded from disk...")
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
@@ -336,9 +370,7 @@ def load_prepare_datasets(
                [dataset],
                seq_length=max_packed_sequence_len,
            )
-            logging.info(
-                f"packing master dataset to len: {cfg.max_packed_sequence_len}"
-            )
+            LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
            dataset = Dataset.from_list(list(constant_len_dataset))

            # filter out bad data
@@ -354,12 +386,12 @@ def load_prepare_datasets(
            )

            if cfg.local_rank == 0:
-                logging.info(
+                LOG.info(
                    f"Saving packed prepared dataset to disk... {prepared_ds_path}"
                )
                dataset.save_to_disk(prepared_ds_path)
                if cfg.push_dataset_to_hub:
-                    logging.info(
+                    LOG.info(
                        f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                    )
                    dataset.push_to_hub(
@@ -372,7 +404,7 @@ def load_prepare_datasets(
        )

    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
-        logging.info(
+        LOG.info(
            f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
        )
        dataset = dataset.shard(
@@ -380,8 +412,127 @@ def load_prepare_datasets(
            index=cfg.dataset_shard_idx,
        )

-    dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
-    train_dataset = dataset["train"]
-    eval_dataset = dataset["test"]
+    if cfg.val_set_size:
+        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
+        train_dataset = dataset["train"]
+        eval_dataset = dataset["test"]
+    else:
+        train_dataset = dataset
+        eval_dataset = None

    return train_dataset, eval_dataset
+
+
+def encode_pretraining(tokenizer, max_tokens, examples):  # tokenize examples["text"] and pack it into max_tokens-long rows
+    res = tokenizer(
+        examples["text"],
+        truncation=True,
+        max_length=max_tokens - 2,  # leave room for the EOS and PAD tokens appended below
+        add_special_tokens=True,
+    )
+    # Convert to PyTorch tensors
+    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
+    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
+    new_input_ids = []
+    new_attention_mask = []
+    # Append EOS and PAD tokens to each sequence; the mask marks EOS as 1 and PAD as 0
+    for i, _ in enumerate(input_ids):
+        input_ids[i] = torch.cat(
+            (
+                input_ids[i],
+                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
+            ),
+            dim=0,
+        )
+        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)
+
+    # Pack sequences into buffers; a buffer is padded up to max_tokens when the next sequence does not fit
+    buffer_input_ids = torch.tensor([], dtype=torch.long)
+    buffer_attention_mask = torch.tensor([], dtype=torch.long)
+
+    for ids, mask in zip(input_ids, attention_mask):
+        if buffer_input_ids.numel() == max_tokens:  # buffer exactly full: emit it, restart with this sequence
+            new_input_ids.append(buffer_input_ids)
+            new_attention_mask.append(buffer_attention_mask)
+            buffer_input_ids = torch.tensor([], dtype=torch.long)
+            buffer_attention_mask = torch.tensor([], dtype=torch.long)
+            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
+        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:  # sequence still fits: append it
+            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
+        else:  # does not fit: pad the buffer out, emit it, then restart with this sequence
+            buffer_input_ids = torch.cat(
+                (
+                    buffer_input_ids,
+                    torch.full(
+                        (max_tokens - buffer_input_ids.numel(),),
+                        tokenizer.pad_token_id,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
+            buffer_attention_mask = torch.cat(
+                (
+                    buffer_attention_mask,
+                    torch.full(
+                        (max_tokens - buffer_attention_mask.numel(),),
+                        0,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
+            new_input_ids.append(buffer_input_ids)
+            new_attention_mask.append(buffer_attention_mask)
+            buffer_input_ids = torch.tensor([], dtype=torch.long)
+            buffer_attention_mask = torch.tensor([], dtype=torch.long)
+
+            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
+
+    if buffer_input_ids.numel() > 0:  # for any leftover tokens
+        while buffer_input_ids.numel() < max_tokens:  # pad to max_tokens; the body fills the whole gap, so it runs at most once
+            buffer_input_ids = torch.cat(
+                (
+                    buffer_input_ids,
+                    torch.full(
+                        (max_tokens - buffer_input_ids.numel(),),
+                        tokenizer.pad_token_id,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
+            buffer_attention_mask = torch.cat(
+                (
+                    buffer_attention_mask,
+                    torch.full(
+                        (max_tokens - buffer_attention_mask.numel(),),
+                        0,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
+        new_input_ids.append(buffer_input_ids)
+        new_attention_mask.append(buffer_attention_mask)
+
+    ret = {
+        "input_ids": [seq.tolist() for seq in new_input_ids],
+        "labels": [seq.tolist() for seq in new_input_ids],  # labels are a copy of input_ids, padding included
+        "attention_mask": [seq.tolist() for seq in new_attention_mask],
+    }
+
+    LOG.debug(len(ret["input_ids"]))  # number of packed rows produced for this batch
+    return ret
+
+
+def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):  # stream, shuffle and encode the "train" split of `path`
+    encode = functools.partial(encode_pretraining, tokenizer, max_tokens)  # pre-bind tokenizer and max_tokens; map() supplies the batch
+    dataset = load_dataset(path, streaming=True, split="train")
+    dataset = dataset.shuffle(seed=seed, buffer_size=10_000)
+    # TODO dynamically figure out which columns/features to remove
+    dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"])
+    return dataset
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -10,24 +10,21 @@ from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401
 import bitsandbytes as bnb
 import torch
 import transformers
-from transformers import PreTrainedModel  # noqa: F401
+from optimum.bettertransformer import BetterTransformer
 from transformers import (  # noqa: F401
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaConfig,
+    PreTrainedModel,
+    PreTrainedTokenizerBase,
 )

-try:
-    from transformers import LlamaForCausalLM
-except ImportError:
-    logging.warning(
-        "This version of transformers does not support Llama. Consider upgrading."
-    )
-
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN

+LOG = logging.getLogger("axolotl")
+
 if TYPE_CHECKING:
    from peft import PeftConfig  # noqa: F401

@@ -39,21 +36,26 @@ def load_tokenizer(
    tokenizer_type,
    cfg,
 ):
+    use_fast = True  # this is the default
+    if cfg.tokenizer_use_fast is not None:
+        use_fast = cfg.tokenizer_use_fast
    if tokenizer_type:
        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
+            use_fast=use_fast,
        )

-    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    logging.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    logging.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if tokenizer.__class__.__name__ in [
        "LlamaTokenizer",
@@ -75,49 +77,64 @@ def load_tokenizer(


 def load_model(
-    base_model,
-    base_model_config,
-    model_type,
-    tokenizer,
-    cfg,
-    adapter="lora",
-    inference=False,
+    base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
 ):
-    # type: (str, str, str, str, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+    # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
    Load a model from a base model and a model type.
    """

    # TODO refactor as a kwarg
    load_in_8bit = cfg.load_in_8bit
-    is_llama_derived_model = "llama" in base_model or (
+    cfg.is_llama_derived_model = "llama" in base_model or (
        cfg.model_type and "llama" in cfg.model_type.lower()
    )

-    if is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and inference is False:
-            from axolotl.flash_attn import replace_llama_attn_with_flash_attn
+    if cfg.is_llama_derived_model and cfg.flash_attention:
+        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
+            from axolotl.monkeypatch.llama_attn_hijack_flash import (
+                replace_llama_attn_with_flash_attn,
+            )

-            logging.info("patching with flash attention")
+            LOG.info("patching with flash attention")
            replace_llama_attn_with_flash_attn()
-    elif is_llama_derived_model and cfg.xformers_attention:
+    elif cfg.is_llama_derived_model and cfg.xformers_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
        )

-        logging.info("patching with xformers attention")
+        LOG.info("patching with xformers attention")
        hijack_llama_attention()
-    elif is_llama_derived_model and cfg.sdp_attention:
+    elif cfg.is_llama_derived_model and cfg.sdp_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_sdp_attention,
        )

-        logging.info("patching with sdp attention")
+        LOG.info("patching with sdp attention")
        hijack_llama_sdp_attention()
+    elif cfg.is_llama_derived_model and cfg.landmark_attention:
+        from axolotl.monkeypatch.llama_landmark_attn import (
+            MEM_TOKEN,
+            patch_llama_with_landmark_attn,
+        )

-    if cfg.bf16:
+        LOG.info("patching with landmark attention")
+        patch_llama_with_landmark_attn()
+
+        # Note: This might overwrite previous additional_special_tokens
+        tokenizer.add_special_tokens({"additional_special_tokens": [MEM_TOKEN]})
+
+    if cfg.is_llama_derived_model and cfg.xpos_rope:
+        from axolotl.monkeypatch.xpos_rope_llama_monkey_patch import (
+            replace_llama_rope_with_xpos_rope,
+        )
+
+        LOG.info("patching with xpos rope")
+        replace_llama_rope_with_xpos_rope()
+
+    if cfg.bf16 or cfg.bfloat16:
        torch_dtype = torch.bfloat16
-    elif cfg.load_in_8bit or cfg.fp16:
+    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
        torch_dtype = torch.float16
    else:
        torch_dtype = torch.float32
@@ -128,12 +145,25 @@ def load_model(
            )

            replace_peft_model_with_int4_lora_model()
-        from peft import prepare_model_for_int8_training
    except Exception as err:
-        logging.exception(err)
+        LOG.exception(err)
        raise err

+    if not cfg.gptq and (
+        (cfg.adapter == "lora" and load_in_8bit)
+        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
+    ):
+        try:
+            from peft import prepare_model_for_kbit_training
+        except ImportError:
+            # For backward compatibility
+            from peft import (
+                prepare_model_for_int8_training as prepare_model_for_kbit_training,
+            )
+
    model_kwargs = {}
+    if cfg.model_revision:
+        model_kwargs["revision"] = cfg.model_revision
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -144,7 +174,7 @@ def load_model(
            bnb_4bit_quant_type="nf4",
        )
    try:
-        if cfg.gptq and is_llama_derived_model:
+        if cfg.gptq and cfg.is_llama_derived_model:
            from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
            from huggingface_hub import snapshot_download

@@ -165,7 +195,7 @@ def load_model(
                if len(files) > 0:
                    model_path = str(files[0])
                else:
-                    logging.warning(
+                    LOG.warning(
                        "unable to find a cached model file, this will likely fail..."
                    )
                    model_path = str(cache_model_path)
@@ -182,7 +212,9 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif is_llama_derived_model and "LlamaForCausalLM" in globals():
+        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
+            from transformers import LlamaForCausalLM
+
            config = LlamaConfig.from_pretrained(base_model_config)
            model = LlamaForCausalLM.from_pretrained(
                base_model,
@@ -219,7 +251,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type:
+        elif model_type and not cfg.trust_remote_code:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -234,6 +266,22 @@ def load_model(
                base_model,
                trust_remote_code=cfg.trust_remote_code or False,
            )
+            # Shouldn't be a problem most of the time. will obviously error if the model doesn't support this
+            # when training starts
+            if (
+                hasattr(config, "max_seq_len")
+                and config.max_seq_len
+                and cfg.sequence_len > config.max_seq_len
+            ):
+                config.max_seq_len = cfg.sequence_len
+                LOG.warning(f"increasing context length to {cfg.sequence_len}")
+            elif (
+                hasattr(config, "max_sequence_length")
+                and config.max_sequence_length
+                and cfg.sequence_len > config.max_sequence_length
+            ):
+                config.max_sequence_length = cfg.sequence_len
+                LOG.warning(f"increasing context length to {cfg.sequence_len}")
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                config=config,
@@ -245,28 +293,55 @@ def load_model(
                **model_kwargs,
            )
    except Exception as err:  # pylint: disable=broad-exception-caught
-        logging.error(
+        LOG.error(
            "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
        )
-        logging.exception(err)
+        LOG.exception(err)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
+            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
            torch_dtype=torch_dtype,
            device_map=cfg.device_map,
            trust_remote_code=cfg.trust_remote_code or False,
            **model_kwargs,
        )

-    embeddings_len = math.ceil(len(tokenizer) / 32) * 32
+    embeddings_len = (
+        math.ceil(len(tokenizer) / 32) * 32
+        if cfg.resize_token_embeddings_to_32x
+        else len(tokenizer)
+    )
    model.resize_token_embeddings(embeddings_len)

+    if (
+        hasattr(model.config, "max_position_embeddings")
+        and model.config.max_position_embeddings
+        and cfg.sequence_len >= model.config.max_position_embeddings
+    ):
+        LOG.warning(
+            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
+        )
+        model.config.max_position_embeddings = cfg.sequence_len
+
    if not cfg.gptq and (
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
-        logging.info("converting PEFT model w/ prepare_model_for_int8_training")
-        model = prepare_model_for_int8_training(model)
+        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
+        model = prepare_model_for_kbit_training(
+            model, use_gradient_checkpointing=cfg.gradient_checkpointing
+        )
+
+        # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
+        # convert them back to fp16/bf16 for flash-attn compatibility.
+        if cfg.flash_attention and cfg.is_llama_derived_model:
+            for name, module in model.named_modules():
+                if "norm" in name:
+                    module.to(torch_dtype)
+                if "lm_head" in name or "embed_tokens" in name:
+                    if hasattr(module, "weight"):
+                        module.to(torch_dtype)

    model, lora_config = load_adapter(model, cfg, adapter)

@@ -275,7 +350,7 @@ def load_model(

    if cfg.gptq:
        # Scales to half
-        logging.info("Fitting 4bit scales and zeros to half")
+        LOG.info("Fitting 4bit scales and zeros to half")
        for _, module in model.named_modules():
            if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
                type(module)
@@ -301,9 +376,12 @@ def load_model(
        if param.requires_grad:
            requires_grad.append(f"{name}: {param.requires_grad}")
    if len(requires_grad) == 0:
-        logging.warning("there are no parameters that require gradient updates")
+        LOG.warning("there are no parameters that require gradient updates")
    model.config.use_cache = False

+    if cfg.flash_optimum:
+        model = BetterTransformer.transform(model)
+
    # TODO resume_from_checkpoint handling
    return model, lora_config

@@ -332,11 +410,10 @@ def load_llama_adapter(model, cfg):
    )

    if cfg.lora_model_dir:
-        logging.info("Loading pretained LORA")
+        LOG.info("Loading pretained LORA")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
-            device_map=cfg.device_map,
            torch_dtype=torch.float16,
        )
    else:
@@ -380,7 +457,7 @@ def load_lora(model, cfg):
            bits = 8

        linear_names = find_all_linear_names(bits, model)
-        logging.info(f"found linear modules: {repr(linear_names)}")
+        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

    lora_config = LoraConfig(
@@ -398,8 +475,7 @@ def load_lora(model, cfg):
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
-            device_map=cfg.device_map,
-            # torch_dtype=torch.float16,
+            is_trainable=not cfg.inference,
        )
    else:
        model = get_peft_model(model, lora_config)
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,6 +1,9 @@
 """Module for custom LRScheduler class"""
+import math
+from functools import partial

-from torch.optim.lr_scheduler import LRScheduler
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR, LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -42,3 +45,58 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
+
+
+def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float
+):
+    if current_step < num_warmup_steps:
+        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
+    progress = float(current_step - num_warmup_steps) / float(
+        max(1, num_training_steps - num_warmup_steps)
+    )
+    return max(
+        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    )
+
+
+def get_cosine_schedule_with_quadratic_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases quadratically between 0 and
+    the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    lr_lambda = partial(
+        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -5,6 +5,8 @@ import logging

 from termcolor import colored

+LOG = logging.getLogger("axolotl")
+

 def check_dataset_labels(dataset, tokenizer):
    # the dataset is already shuffled, so let's just check the first 5 elements
@@ -32,5 +34,7 @@ def check_example_labels(example, tokenizer):
        )
        colored_tokens.append(colored_token)

-    logging.info(" ".join(colored_tokens))
-    logging.info("\n\n\n")
+    LOG.info(" ".join(colored_tokens))
+    LOG.info("\n\n\n")
+
+    return " ".join(colored_tokens)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -1,9 +1,11 @@
 """Module containing the Trainer class and related functions"""

 import importlib
+import logging
 import math
 import os
 import sys
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional

@@ -12,14 +14,70 @@ import torch.cuda
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import EarlyStoppingCallback, Trainer
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
 from transformers.trainer_pt_utils import get_parameter_names

-from axolotl.utils.callbacks import SavePeftModelCallback
-from axolotl.utils.schedulers import InterpolatingLogScheduler
+from axolotl.utils.callbacks import (
+    SaveBetterTransformerModelCallback,
+    SavePeftModelCallback,
+)
+from axolotl.utils.schedulers import (
+    InterpolatingLogScheduler,
+    get_cosine_schedule_with_quadratic_warmup,
+)
+
+LOG = logging.getLogger("axolotl")


-class OneCycleLRSchedulerTrainer(Trainer):
+@dataclass
+class AxolotlTrainingArguments(TrainingArguments):
+    """
+    Extend the base TrainingArguments for axolotl helpers
+    """
+
+    lr_quadratic_warmup: bool = field(
+        default=False,
+        metadata={"help": "Use quadratic warmup for cosine scheduling."},
+    )
+
+
+class AxolotlTrainer(Trainer):
+    """
+    Extend the base Trainer for axolotl helpers
+    """
+
+    args = None  # type: AxolotlTrainingArguments
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
+    ):
+        """
+        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+        passed as an argument. Returns the scheduler (quadratic-warmup cosine when configured, else the base one).
+
+        Args:
+            num_training_steps (int): The number of training steps to do.
+            optimizer (torch.optim.Optimizer): The training optimizer; falls back to `self.optimizer` when None.
+        """
+
+        # fmt: off
+        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
+            # fmt: on
+            if (
+                self.args.lr_scheduler_type == "cosine"
+                and self.args.lr_quadratic_warmup is True
+            ):
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
+                    self.optimizer if optimizer is None else optimizer,
+                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
+                    num_training_steps=num_training_steps,
+                )
+            else:
+                return super().create_scheduler(num_training_steps, optimizer)
+        return self.lr_scheduler
+
+
+class OneCycleLRSchedulerTrainer(AxolotlTrainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -62,8 +120,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.logging_steps is not None
        else max(min(int(0.005 * total_num_steps), 10), 1)
    )
-    save_steps = cfg.save_steps
-    eval_steps = cfg.eval_steps

    training_arguments_kwargs = {}
    if cfg.bf16 == "full":
@@ -74,6 +130,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    training_arguments_kwargs["tf32"] = cfg.tf32
    training_arguments_kwargs["warmup_steps"] = warmup_steps
    training_arguments_kwargs["logging_steps"] = logging_steps
+
+    if cfg.seed is not None:  # 0 is a valid seed and must not be dropped
+        training_arguments_kwargs["seed"] = cfg.seed
+
    if cfg.gradient_checkpointing:
        if cfg.gptq:
            from alpaca_lora_4bit.gradient_checkpointing import (
@@ -97,6 +157,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

+    if cfg.lr_quadratic_warmup is not None:
+        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
+
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -109,7 +172,24 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
            # TODO search Path("./") for one
            training_arguments_kwargs["deepspeed"] = "./ds_config.json"

-    training_args = transformers.TrainingArguments(
+    if cfg.adam_beta1 is not None:  # forward explicit zeros; only skip unset keys
+        training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1
+    if cfg.adam_beta2 is not None:
+        training_arguments_kwargs["adam_beta2"] = cfg.adam_beta2
+    if cfg.adam_epsilon is not None:
+        training_arguments_kwargs["adam_epsilon"] = cfg.adam_epsilon
+    if cfg.max_grad_norm is not None:
+        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
+
+    if cfg.hub_model_id:
+        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
+        training_arguments_kwargs["push_to_hub"] = True
+        training_arguments_kwargs["hub_private_repo"] = True
+
+    if cfg.save_safetensors:
+        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors
+
+    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -119,16 +199,16 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        num_train_epochs=cfg.num_epochs,
        learning_rate=cfg.learning_rate,
        evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
-        save_strategy="steps" if save_steps else "epoch",
-        eval_steps=eval_steps if cfg.val_set_size > 0 else None,
-        save_steps=save_steps,
+        save_strategy="steps" if cfg.save_steps else "epoch",
+        eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
+        save_steps=cfg.save_steps,
        output_dir=cfg.output_dir,
        save_total_limit=3,
        load_best_model_at_end=(
            cfg.load_best_model_at_end is not False
            and cfg.val_set_size > 0
-            and save_steps
-            and save_steps % eval_steps == 0
+            and cfg.save_steps
+            and cfg.save_steps % cfg.eval_steps == 0
            and cfg.load_in_8bit is not True
        )
        or False,
@@ -225,6 +305,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    ]:  # only save in rank 0
        callbacks.append(SavePeftModelCallback)

+    if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
+        callbacks.append(SaveBetterTransformerModelCallback)
+
    data_collator_kwargs = {
        "padding": True,
    }
@@ -233,10 +316,30 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    else:
        data_collator_kwargs["pad_to_multiple_of"] = 8

+    if cfg.is_llama_derived_model and cfg.landmark_attention:
+        from functools import partial
+
+        from axolotl.monkeypatch.llama_landmark_attn import (
+            add_mem_tokens,
+            get_mem_id,
+            set_model_mem_id,
+        )
+
+        set_model_mem_id(model, tokenizer)
+
+        LOG.info("Adding landmark attention tokens to dataset")
+
+        # `map` returns a new dataset, so rebind the results for the trainer.
+        mem_id = get_mem_id(tokenizer)
+        add_landmarks = partial(add_mem_tokens, mem_freq=50, mem_id=mem_id)
+        train_dataset = train_dataset.map(add_landmarks, batched=False, num_proc=32)
+        if eval_dataset is not None:  # presumably None when there is no eval split
+            eval_dataset = eval_dataset.map(add_landmarks, batched=False, num_proc=32)
+
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else transformers.Trainer
+        else AxolotlTrainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -2,12 +2,22 @@

 import logging

+import torch
+
+LOG = logging.getLogger("axolotl")
+

 def validate_config(cfg):
    if cfg.gradient_accumulation_steps and cfg.batch_size:
        raise ValueError(
            "please set only one of gradient_accumulation_steps or batch_size"
        )
+    if cfg.batch_size:
+        LOG.warning(
+            "%s\n%s",
+            "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
+            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
+        )
    if cfg.load_4bit:
        raise ValueError(
            "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
@@ -36,10 +46,10 @@ def validate_config(cfg):
                raise ValueError("Require cfg.load_in_4bit to be True for qlora")

    if not cfg.load_in_8bit and cfg.adapter == "lora":
-        logging.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
+        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")

    if cfg.trust_remote_code:
-        logging.warning(
+        LOG.warning(
            "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
        )

@@ -48,7 +58,53 @@ def validate_config(cfg):
            "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub"
        )

+    if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
+        raise ValueError("FSDP is not supported for falcon models")
+
+    if (
+        cfg.base_model and "mpt" in cfg.base_model.lower()
+    ) and cfg.gradient_checkpointing:
+        raise ValueError("gradient_checkpointing is not supported for MPT models")
+
+    if cfg.flash_optimum is True:
+        if cfg.adapter:
+            LOG.warning("BetterTransformers probably doesn't work with PEFT adapters")
+        if cfg.fp16 or cfg.bf16:
+            raise ValueError("AMP is not supported with BetterTransformer")
+        if cfg.float16 is not True and cfg.bfloat16 is not True:
+            LOG.warning(
+                "You should probably set bfloat16 or float16 to true to "
+                "load the model in float16 for BetterTransformers"
+            )
+        if int(torch.__version__.split(".")[0]) < 2:
+            LOG.warning("torch>=2.0.0 required")
+            raise ValueError(
+                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
+            )
+
+    if cfg.pretraining_dataset and cfg.group_by_length:
+        LOG.warning(
+            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
+        )
+
+    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
+        not cfg.optimizer or "adamw" not in cfg.optimizer
+    ):
+        LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
+
+    if cfg.push_to_hub_model_id:
+        raise ValueError(
+            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
+        )
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
-    # no 8bit adamw w bf16
+    # no 8bit adamw w bf16
+
+    # GPT-NeoX
+    # evals broken when extending context len
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward                        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+    # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product
+    # attention_mask = causal_mask + attention_mask
+    # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3
--- a/src/axolotl/utils/wandb.py
+++ b/src/axolotl/utils/wandb.py
@@ -15,3 +15,5 @@ def setup_wandb_env_vars(cfg):
            os.environ["WANDB_LOG_MODEL"] = cfg.wandb_log_model
        if cfg.wandb_run_id and len(cfg.wandb_run_id) > 0:
            os.environ["WANDB_RUN_ID"] = cfg.wandb_run_id
+    else:
+        os.environ["WANDB_DISABLED"] = "true"
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -6,10 +6,18 @@ from pathlib import Path

 from transformers import AutoTokenizer

-from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
-from axolotl.prompters import ShareGPTPrompter
+from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
+from axolotl.prompt_strategies.alpaca_w_system import (
+    InstructionWSystemPromptTokenizingStrategy,
+    SystemDataPrompter,
+)
+from axolotl.prompt_tokenizers import (
+    AlpacaPromptTokenizingStrategy,
+    ShareGPTPromptTokenizingStrategy,
+)
+from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter

-logging.basicConfig(level="INFO")
+LOG = logging.getLogger("axolotl")


 class TestPromptTokenizationStrategies(unittest.TestCase):
@@ -29,7 +37,6 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
        )

    def test_sharegpt_integration(self):
-        print(Path(__file__).parent)
        with open(
            Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8"
        ) as fin:
@@ -53,6 +60,80 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
            self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
            self.assertEqual(example[fields], tokenized_conversation[fields])

+    def test_no_sys_prompt(self):
+        """
+        tests the interface between the user and assistant parts
+        """
+        prompter = NoSystemPrompter()
+        # pylint: disable=duplicate-code
+        strat = AlpacaPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        sample = {
+            "instruction": "hello cruel. lorem ipsum dolor sit amet.",
+            "output": "world!",
+        }
+        example = strat.tokenize_prompt(sample)
+        world_idx = example["input_ids"].index(3186)
+        assert example["labels"][world_idx] == 3186
+        assert example["labels"][world_idx - 1] == -100
+
+    def test_alpaca(self):
+        """
+        tests the interface between the user and assistant parts
+        """
+        # pylint: disable=duplicate-code
+        prompter = AlpacaPrompter()
+        strat = AlpacaPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        sample = {"instruction": "hello!", "output": "Hi! How can I help?"}
+        example = strat.tokenize_prompt(sample)
+        world_idx = example["input_ids"].index(6324)
+        assert example["labels"][world_idx] == 6324
+        assert example["labels"][world_idx - 1] == -100
+
+
+class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
+    """
+    Test class for prompt tokenization strategies with sys prompt from the dataset
+    """
+
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<s>",
+                "eos_token": "</s>",
+                "unk_token": "<unk>",
+            }
+        )
+
+    def test_system_alpaca(self):
+        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
+        strat = InstructionWSystemPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        sample = {
+            "system": "use cot",
+            "instruction": "hello!",
+            "output": "Hi! How can I help?",
+        }
+        example = strat.tokenize_prompt(sample)
+        assert example["input_ids"][0:4] == [1, 835, 2184, 29901]  # "<s>### System:"
+        assert example["input_ids"][5:7] == [1509, 20118]  # "use cot"
+        assert example["input_ids"][9] == 11889  # USER
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -2,7 +2,13 @@

 import unittest

-from axolotl.prompters import AlpacaPrompter, PromptStyle
+from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
+from axolotl.prompters import (
+    AlpacaPrompter,
+    MultipleChoiceExplainPrompter,
+    PromptStyle,
+    UnpromptedPrompter,
+)


 class AlpacaPrompterTest(unittest.TestCase):
@@ -55,3 +61,64 @@ class AlpacaPrompterTest(unittest.TestCase):
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res
+
+    def test_system_prompt(self):
+        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt_w_system(
+                "use cot", "tell me a joke about the following", "alpacas"
+            )
+        )
+        assert "use cot" in res
+        assert res.startswith("### System:")
+        assert "### Instruction:" not in res
+        assert "### Input:" not in res
+        assert "alpacas" in res
+        assert "### Response:" not in res
+        assert "USER:" in res
+        assert "ASSISTANT:" in res
+
+
+class UnpromptedPrompterTest(unittest.TestCase):
+    """
+    Test class for UnpromptedPrompter with no system prompts
+    """
+
+    def test_prompt_style_w_none(self):
+        prompter = UnpromptedPrompter(prompt_style=None)
+        res = next(prompter.build_prompt("tell me a joke"))
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_instruct(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "### Instruction:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("###")
+
+    def test_prompt_style_w_chat(self):
+        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "USER:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("USER:")
+
+
+class MultipleChoiceExplainPrompterTest(unittest.TestCase):
+    """
+    Test class for MultipleChoiceExplainPrompter
+    """
+
+    def test_prompt_style_w_chat(self):
+        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
+        assert "USER:" in res
+        assert "choose one" in res
+        assert "Choose the answer that best answers the question." in res
+        assert "- A\n- B\n- C" in res
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -0,0 +1,31 @@
+"""
+Test cases for the tokenizer loading
+"""
+import unittest
+
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.models import load_tokenizer
+
+
+class TestTokenizers(unittest.TestCase):
+    """
+    test class for the load_tokenizer fn
+    """
+
+    def test_default_use_fast(self):
+        cfg = DictDefault({})
+        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        assert "Fast" in tokenizer.__class__.__name__
+
+    def test_dont_use_fast(self):
+        cfg = DictDefault(
+            {
+                "tokenizer_use_fast": False,
+            }
+        )
+        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
+        assert "Fast" not in tokenizer.__class__.__name__
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -1,6 +1,8 @@
 """Module for testing the validation module"""

+import logging
 import unittest
+from typing import Optional

 import pytest

@@ -13,6 +15,12 @@ class ValidationTest(unittest.TestCase):
    Test the validation module
    """

+    _caplog: Optional[pytest.LogCaptureFixture] = None
+
+    @pytest.fixture(autouse=True)
+    def inject_fixtures(self, caplog):
+        self._caplog = caplog
+
    def test_load_4bit_deprecate(self):
        cfg = DictDefault(
            {
@@ -23,6 +31,17 @@ class ValidationTest(unittest.TestCase):
        with pytest.raises(ValueError):
            validate_config(cfg)

+    def test_batch_size_unused_warning(self):
+        cfg = DictDefault(
+            {
+                "batch_size": 32,
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert "batch_size is not recommended" in self._caplog.records[0].message
+
    def test_qlora(self):
        base_cfg = DictDefault(
            {
@@ -146,3 +165,151 @@ class ValidationTest(unittest.TestCase):
        )

        validate_config(cfg)
+
+    def test_falcon_fsdp(self):
+        regex_exp = r".*FSDP is not supported for falcon models.*"
+
+        # Check for lower-case
+        cfg = DictDefault(
+            {
+                "base_model": "tiiuae/falcon-7b",
+                "fsdp": ["full_shard", "auto_wrap"],
+            }
+        )
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
+
+        # Check for upper-case
+        cfg = DictDefault(
+            {
+                "base_model": "Falcon-7b",
+                "fsdp": ["full_shard", "auto_wrap"],
+            }
+        )
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "base_model": "tiiuae/falcon-7b",
+            }
+        )
+
+        validate_config(cfg)
+
+    def test_mpt_gradient_checkpointing(self):
+        regex_exp = r".*gradient_checkpointing is not supported for MPT models.*"
+
+        # Check for lower-case
+        cfg = DictDefault(
+            {
+                "base_model": "mosaicml/mpt-7b",
+                "gradient_checkpointing": True,
+            }
+        )
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
+
+    def test_flash_optimum(self):
+        cfg = DictDefault(
+            {
+                "flash_optimum": True,
+                "adapter": "lora",
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "BetterTransformers probably doesn't work with PEFT adapters"
+                in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(
+            {
+                "flash_optimum": True,
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "probably set bfloat16 or float16" in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(
+            {
+                "flash_optimum": True,
+                "fp16": True,
+            }
+        )
+        regex_exp = r".*AMP is not supported.*"
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "flash_optimum": True,
+                "bf16": True,
+            }
+        )
+        regex_exp = r".*AMP is not supported.*"
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)
+
+    def test_adamw_hyperparams(self):
+        cfg = DictDefault(
+            {
+                "optimizer": None,
+                "adam_epsilon": 0.0001,
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "adamw hyperparameters found, but no adamw optimizer set"
+                in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(
+            {
+                "optimizer": "adafactor",
+                "adam_beta1": 0.0001,
+            }
+        )
+
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "adamw hyperparameters found, but no adamw optimizer set"
+                in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(
+            {
+                "optimizer": "adamw_bnb_8bit",
+                "adam_beta1": 0.9,
+                "adam_beta2": 0.99,
+                "adam_epsilon": 0.0001,
+            }
+        )
+
+        validate_config(cfg)
+
+        cfg = DictDefault(
+            {
+                "optimizer": "adafactor",
+            }
+        )
+
+        validate_config(cfg)