pre-commit formatting fixes

Whitespace bug fix
Command had accidentally been moved out of if-else block.
2023-08-05 22:46:02 -04:00 · 2023-08-05 15:08:44 +12:00 · 2023-08-05 11:45:12 +12:00 · 2023-08-05 11:01:44 +12:00 · 2023-08-04 12:12:05 -04:00 · 2023-08-03 16:25:30 -07:00
33 changed files with 1040 additions and 307 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -18,23 +18,13 @@ jobs:
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
-            axolotl_extras:
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
-            axolotl_extras:
-          - cuda: "117"
-            cuda_version: 11.7.1
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
-          - cuda: "118"
-            cuda_version: 11.8.0
-            python_version: "3.9"
-            pytorch: 2.0.0
-            axolotl_extras: gptq
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -58,11 +48,9 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}
+            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,23 +17,18 @@ jobs:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras: gptq
-          - cuda: cu117
-            cuda_version: 11.7.1
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -55,13 +50,11 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
  build-axolotl-runpod:
    needs: build-axolotl
    if: github.repository_owner == 'OpenAccess-AI-Collective'
@@ -69,26 +62,21 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras:
-          - cuda: cu118
+          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.0
+            pytorch: 2.0.1
            axolotl_extras: gptq
-          - cuda: cu117
-            cuda_version: 11.7.1
-            python_version: "3.9"
-            pytorch: 1.13.1
-            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -110,10 +98,9 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-runpod
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
-          cache-from: type=gha
-          cache-to: type=gha,mode=max
--- a/202
+++ b/202
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -24,11 +24,12 @@
 | mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |
 | falcon   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
 | gpt-j    | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❓          | ✅             |
+| XGen     | ✅         | ❓    | ✅     | ❓    | ❓            | ❓                 | ❓          | ✅             |


 ## Quickstart ⚡

-**Requirements**: Python 3.9 and Pytorch 2.0.
+**Requirements**: Python >=3.9 and Pytorch >=2.0.

 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
@@ -36,8 +37,6 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
 pip3 install -e .
 pip3 install -U git+https://github.com/huggingface/peft.git

-accelerate config
-
 # finetune lora
 accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml

@@ -52,11 +51,10 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

 - Docker
  ```bash
-  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.9-cu118-2.0.0
+  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
  ```
-  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.0`: for runpod
-  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.0-gptq`: for gptq
-  - `winglian/axolotl:dev`: dev branch (not usually up to date)
+  - `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
+  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq

  Or run on the current files for development:

@@ -108,7 +106,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

  3. Install torch
  ```bash
-  pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+  pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
  ```

  4. Axolotl
@@ -237,7 +235,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type.
+  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -245,7 +243,7 @@ Optionally, download some datasets, see [data/README.md](data/README.md)

 ### Config

-See sample configs in [configs](configs) folder or [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:
+See [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:

 - model
  ```yaml
@@ -255,10 +253,24 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic

 - dataset
  ```yaml
+  sequence_len: 2048 # max token length for prompt
+
+  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # huggingface repo with specific configuration/subset
+  datasets:
+    - path: EleutherAI/pile
+      name: enron_emails
+      type: completion # format from earlier
+
+  # local
+  datasets:
+    - path: json
+      data_files: data.jsonl # or json
      type: alpaca # format from earlier
-  sequence_len: 2048 # max token length / prompt
  ```

 - loading
@@ -297,6 +309,8 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
+# optionally pin a specific model revision from the huggingface hub
+model_revision:
 # Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
@@ -308,6 +322,9 @@ tokenizer_type: AutoTokenizer
 trust_remote_code:
 # use_fast option for tokenizer loading from_pretrained, default to True
 tokenizer_use_fast:
+# when new tokens are added, resize the model embeddings to a multiple of 32
+# this is reported to improve training speed on some models
+resize_token_embeddings_to_32x:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -328,12 +345,13 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # this can be either a hf dataset, or relative path
+  # hf dataset repo | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format OR format:prompt_style (chat/instruct)
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    data_files: # path to source data files
    shards: # number of shards to split data into
+    name: # name of dataset configuration to load

 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
@@ -341,7 +359,7 @@ dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
 # push checkpoints to hub
-push_to_hub_model_id: # repo path
+hub_model_id: # repo path to push finetuned model
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -403,6 +421,9 @@ logging_steps:
 save_steps:
 eval_steps:

+# save model as safetensors (requires the safetensors package)
+save_safetensors:
+
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
@@ -494,17 +515,6 @@ strict:

 </details>

-### Accelerate
-
-Configure accelerate
-
-```bash
-accelerate config
-
-# Edit manually
-# nano ~/.cache/huggingface/accelerate/default_config.yaml
-```
-
 ### Train

 Run
@@ -512,6 +522,21 @@ Run
 accelerate launch scripts/finetune.py configs/your_config.yml
 ```

+#### Multi-GPU Config
+
+- llama FSDP
+```yaml
+fsdp:
+  - full_shard
+  - auto_wrap
+fsdp_config:
+  fsdp_offload_params: true
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
+```
+
+- llama Deepspeed: prepend `ACCELERATE_USE_DEEPSPEED=true` to the finetune command
+
 ### Inference

 Pass the appropriate flag to the train command:
@@ -562,6 +587,10 @@ Try set `fp16: true`

 Try to turn off xformers.

+> accelerate config missing
+
+It's safe to ignore it.
+
 ## Need help? 🙋♂️

 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -3,16 +3,15 @@ FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA

 RUN apt-get update && \
    apt-get install -y vim curl

 WORKDIR /workspace

-RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main"
-
+RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
 RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN cd axolotl && \
@@ -22,5 +21,10 @@ RUN cd axolotl && \
        pip install -e .; \
    fi

+# fix so that git fetch/pull from remote works
+RUN cd axolotl && \
+    git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
 # helper for huggingface-login cli
 RUN git config --global credential.helper store
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -8,7 +8,7 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a
 ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.9"
-ARG PYTORCH="2.0.0"
+ARG PYTORCH_VERSION="2.0.1"
 ARG CUDA="118"

 ENV PYTHON_VERSION=$PYTHON_VERSION
@@ -29,17 +29,18 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA


 FROM base-builder AS flash-attn-builder

 WORKDIR /workspace

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

-RUN git clone https://github.com/HazyResearch/flash-attention.git && \
+RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
    cd flash-attention && \
+    git checkout v2.0.1  && \
    python3 setup.py bdist_wheel && \
    cd csrc/fused_dense_lib && \
    python3 setup.py bdist_wheel && \
@@ -52,7 +53,7 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \

 FROM base-builder AS deepspeed-builder

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 WORKDIR /workspace

@@ -73,6 +74,9 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \

 FROM base-builder

+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
 # recompile apex
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
@@ -97,4 +101,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docker/Dockerfile-runpod
+++ b/docker/Dockerfile-runpod
@@ -1,6 +1,10 @@
 ARG BASE_TAG=main
 FROM winglian/axolotl:$BASE_TAG

+ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+
 COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

 RUN apt install --yes --no-install-recommends openssh-server tmux && \
--- a/examples/llama-2/README.md
+++ b/examples/llama-2/README.md
@@ -0,0 +1,20 @@
+# Overview
+
+This is an example of a llama-2 configuration for 7b and 13b. The yaml file contains configuration for the 7b variant, but you can just as well use the same settings for 13b.
+
+The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
+
+The 13b variant will fit if you change these settings to these values:
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+
+```shell
+accelerate launch scripts/finetune.py examples/llama-2/qlora.yml
+
+```
+or
+
+```shell
+accelerate launch scripts/finetune.py examples/llama-2/lora.yml
+
+```
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -0,0 +1,66 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./lora-out
+
+sequence_len: 4096
+max_packed_sequence_len: 4096
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+  pad_token: "<pad>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -0,0 +1,67 @@
+base_model: meta-llama/Llama-2-7b-hf
+base_model_config: meta-llama/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./qlora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 4096
+max_packed_sequence_len: 4096
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 3
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
+  pad_token: "<pad>"
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -0,0 +1,90 @@
+# An example finetuning Salesforce's XGen-7b model with 8k context using qlora
+# on Tim Dettmers' Guanaco dataset.
+base_model: Salesforce/xgen-7b-8k-base
+base_model_config: Salesforce/xgen-7b-8k-base
+trust_remote_code: true
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+load_in_8bit: false
+# enable 4bit for QLoRA
+load_in_4bit: true
+gptq: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: timdettmers/openassistant-guanaco
+    data_files:
+      - openassistant_best_replies_train.jsonl
+    type: "completion"
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+# enable QLoRA
+adapter: qlora
+lora_model_dir:
+sequence_len: 8192
+max_packed_sequence_len:
+
+# hyperparameters from QLoRA paper Appendix B.2
+# "We find hyperparameters to be largely robust across datasets"
+lora_r: 64
+lora_alpha: 16
+# 0.1 for models up to 13B
+# 0.05 for 33B and 65B models
+lora_dropout: 0.05
+# add LoRA modules on all linear layers of the base model
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+
+# QLoRA paper Table 9
+# - 16 for 7b & 13b
+# - 32 for 33b, 64 for 65b
+# Max size tested on A6000
+# - 7b: 40
+# - 40b: 4
+# decrease if OOM, increase for max VRAM utilization
+micro_batch_size: 1
+gradient_accumulation_steps: 1
+num_epochs: 3
+# Optimizer for QLoRA
+optimizer: paged_adamw_32bit
+torchdistx_path:
+lr_scheduler: cosine
+# QLoRA paper Table 9
+# - 2e-4 for 7b & 13b
+# - 1e-4 for 33b & 65b
+learning_rate: 0.00002
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+gradient_checkpointing: true
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps: 50
+debug:
+deepspeed:
+weight_decay: 0.0
+special_tokens:
+  eos_token: "<|endoftext|>"
+  bos_token: "<|endoftext|>"
+  unk_token: "<|endoftext|>"
+  pad_token: "<|endoftext|>"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
-accelerate
+accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
 addict
 fire
 PyYAML==6.0
@@ -12,6 +12,7 @@ wandb
 einops
 xformers
 optimum
+hf_transfer
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
--- a/scripts/alpaca_json_to_jsonl.py
+++ b/scripts/alpaca_json_to_jsonl.py
@@ -15,6 +15,9 @@ from axolotl.convert import (
    JsonToJsonlConverter,
    StdoutWriter,
 )
+from axolotl.logging_config import configure_logging
+
+configure_logging()

 # add src to the pythonpath so we don't need to pip install this
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -17,6 +17,7 @@ import yaml
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

+from axolotl.logging_config import configure_logging
 from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
@@ -29,9 +30,12 @@ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

+configure_logging()
+LOG = logging.getLogger("axolotl.scripts")
+

-logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


 def choose_device(cfg):
@@ -212,7 +216,7 @@ def train(

    # load the tokenizer first
    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    logging.info(f"loading tokenizer... {tokenizer_config}")
+    LOG.info(f"loading tokenizer... {tokenizer_config}")
    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)

    if (
@@ -234,7 +238,7 @@ def train(
            eval_dataset = None

    if cfg.debug or "debug" in kwargs:
-        logging.info("check_dataset_labels...")
+        LOG.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
@@ -243,11 +247,11 @@ def train(
        )

    if prepare_ds_only:
-        logging.info("Finished preparing dataset. Exiting...")
+        LOG.info("Finished preparing dataset. Exiting...")
        return

    # Load the model and tokenizer
-    logging.info("loading model and peft_config...")
+    LOG.info("loading model and peft_config...")
    model, peft_config = load_model(
        cfg.base_model,
        cfg.base_model_config,
@@ -258,17 +262,17 @@ def train(
    )

    if "merge_lora" in kwargs and cfg.adapter is not None:
-        logging.info("running merge of LoRA with base model")
+        LOG.info("running merge of LoRA with base model")
        model = model.merge_and_unload()
        model.to(dtype=torch.float16)

        if cfg.local_rank == 0:
-            logging.info("saving merged model")
+            LOG.info("saving merged model")
            model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

    if cfg.inference:
-        logging.info("calling do_inference function")
+        LOG.info("calling do_inference function")
        prompter: Optional[str] = "AlpacaPrompter"
        if "prompter" in kwargs:
            if kwargs["prompter"] == "None":
@@ -287,12 +291,12 @@ def train(
    model.config.use_cache = False

    if torch.__version__ >= "2" and sys.platform != "win32":
-        logging.info("Compiling torch model")
+        LOG.info("Compiling torch model")
        model = torch.compile(model)

    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
-        logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
        peft_config.save_pretrained(cfg.output_dir)

    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
@@ -308,9 +312,9 @@ def train(
            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
        )

-    logging.info("Starting trainer...")
+    LOG.info("Starting trainer...")
    if cfg.group_by_length:
-        logging.info("hang tight... sorting dataset for group_by_length")
+        LOG.info("hang tight... sorting dataset for group_by_length")
    resume_from_checkpoint = cfg.resume_from_checkpoint
    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
        possible_checkpoints = [
@@ -322,7 +326,7 @@ def train(
                key=lambda path: int(path.split("-")[-1]),
            )
            resume_from_checkpoint = sorted_paths[-1]
-            logging.info(
+            LOG.info(
                f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
            )

@@ -336,11 +340,13 @@ def train(
    else:
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

-    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
-    if cfg.local_rank == 0:
+    if cfg.fsdp:
+        model.save_pretrained(cfg.output_dir)
+    elif cfg.local_rank == 0:
        if cfg.flash_optimum:
            model = BetterTransformer.reverse(model)
        model.save_pretrained(cfg.output_dir)
--- a/scripts/runpod-entrypoint.sh
+++ b/scripts/runpod-entrypoint.sh
@@ -1,10 +1,21 @@
 #!/bin/bash

-echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
-chmod 700 -R ~/.ssh
+# Export specific ENV variables to /etc/rp_environment
+echo "Exporting environment variables..."
+printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
+echo 'source /etc/rp_environment' >> ~/.bashrc

-# Start the SSH service in the background
-service ssh start
+if [[ $PUBLIC_KEY ]]
+then
+    mkdir -p ~/.ssh
+    chmod 700 ~/.ssh
+    echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
+    chmod 700 -R ~/.ssh
+    # Start the SSH service in the background
+    service ssh start
+else
+    echo "No PUBLIC_KEY ENV variable provided, not starting openSSH daemon"
+fi

 # Execute the passed arguments (CMD)
 exec "$@"
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,12 +1,13 @@
 """Module containing Dataset functionality"""

 import logging
+import os
 from typing import List

 import torch
 from datasets import IterableDataset

-from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy
+from .prompt_tokenizers import PromptTokenizingStrategy

 # We want this to be a wrapper for an existing dataset that we have loaded
 # lets use the concept of middlewares to wrap each dataset, for example
@@ -14,6 +15,8 @@ from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets

+LOG = logging.getLogger("axolotl")
+

 class TokenizedPromptDataset(IterableDataset):
    """
@@ -32,17 +35,15 @@ class TokenizedPromptDataset(IterableDataset):
        self.dataset = dataset

    def __iter__(self):
-        iterator = iter(self.dataset)
-        count = 0
-        # Loop through the entire dataset
-        for example in iterator:
-            try:
-                yield self.prompt_tokenizer.tokenize_prompt(example)
-                count += 1
-            except InvalidDataException:
-                pass
-        if count == 0:
-            raise RuntimeError("Expected at least one datapoint in dataset.")
+        features = self.dataset.features.keys()
+        num_proc = os.cpu_count()
+        return iter(
+            self.dataset.map(
+                self.prompt_tokenizer.tokenize_prompt,
+                num_proc=num_proc,
+                remove_columns=features,
+            )
+        )


 # TODO this isn't the best since it can't interleave datasets
@@ -115,7 +116,7 @@ class ConstantLengthDataset(IterableDataset):
                                "attention_mask": attention_mask,
                            }
                        else:
-                            logging.warning(
+                            LOG.warning(
                                f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                            )
                    buffer = {
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -0,0 +1,33 @@
+"""Logging configuration settings"""
+
+import os
+import sys
+from logging.config import dictConfig
+from typing import Any, Dict
+
+DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
+    "version": 1,
+    "formatters": {
+        "simple": {
+            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
+        },
+    },
+    "filters": {},
+    "handlers": {
+        "console": {
+            "class": "logging.StreamHandler",
+            "formatter": "simple",
+            "filters": [],
+            "stream": sys.stdout,
+        },
+    },
+    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
+    "loggers": {
+        "axolotl": {"handlers": ["console"], "level": "DEBUG", "propagate": False},
+    },
+}
+
+
+def configure_logging():
+    """Configure with default logging"""
+    dictConfig(DEFAULT_LOGGING_CONFIG)
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -8,7 +8,7 @@ import torch
 import transformers
 from einops import rearrange
 from flash_attn.bert_padding import pad_input, unpad_input
-from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb


@@ -79,7 +79,7 @@ def forward(
            dtype=torch.int32,
            device=qkv.device,
        )
-        output = flash_attn_unpadded_qkvpacked_func(
+        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
@@ -95,7 +95,7 @@ def forward(
            three=3,
            h=nheads,
        )
-        output_unpad = flash_attn_unpadded_qkvpacked_func(
+        output_unpad = flash_attn_varlen_qkvpacked_func(
            x_unpad,
            cu_q_lens,
            max_s,
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -7,6 +7,7 @@ import math
 from typing import Optional, Tuple

 import torch
+import torch.nn.functional as F
 import transformers.models.llama.modeling_llama
 from torch import nn

@@ -38,21 +39,48 @@ def xformers_forward(
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

-    query_states = (
-        self.q_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
-    key_states = (
-        self.k_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
-    value_states = (
-        self.v_proj(hidden_states)
-        .view(bsz, q_len, self.num_heads, self.head_dim)
-        .transpose(1, 2)
-    )
+    if not hasattr(self, "pretraining_tp"):
+        self.pretraining_tp = 1
+
+    if self.pretraining_tp > 1:
+        key_value_slicing = (
+            self.num_key_value_heads * self.head_dim
+        ) // self.pretraining_tp
+        query_slices = self.q_proj.weight.split(
+            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
+        )
+        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
+        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
+
+        query_states = [
+            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        query_states = torch.cat(query_states, dim=-1)
+
+        key_states = [
+            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        key_states = torch.cat(key_states, dim=-1)
+
+        value_states = [
+            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
+        ]
+        value_states = torch.cat(value_states, dim=-1)
+
+    else:
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+    query_states = query_states.view(
+        bsz, q_len, self.num_heads, self.head_dim
+    ).transpose(1, 2)
+    key_states = key_states.view(
+        bsz, q_len, self.num_key_value_heads, self.head_dim
+    ).transpose(1, 2)
+    value_states = value_states.view(
+        bsz, q_len, self.num_key_value_heads, self.head_dim
+    ).transpose(1, 2)

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
@@ -73,6 +101,14 @@ def xformers_forward(

    past_key_value = (key_states, value_states) if use_cache else None

+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = transformers.models.llama.modeling_llama.repeat_kv(
+        key_states, self.num_key_value_groups
+    )
+    value_states = transformers.models.llama.modeling_llama.repeat_kv(
+        value_states, self.num_key_value_groups
+    )
+
    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
@@ -128,10 +164,23 @@ def xformers_forward(
                f" {attn_output.size()}"
            )

-        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        # end x-formers vs. not x-formers if-else block

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-    attn_output = self.o_proj(attn_output)
+
+    if self.pretraining_tp > 1:
+        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
+        o_proj_slices = self.o_proj.weight.split(
+            self.hidden_size // self.pretraining_tp, dim=1
+        )
+        attn_output = sum(
+            F.linear(attn_output[i], o_proj_slices[i])
+            for i in range(self.pretraining_tp)
+        )
+    else:
+        attn_output = self.o_proj(attn_output)
+
    return attn_output, attn_weights, past_key_value


@@ -184,14 +233,15 @@ def sdp_attention_forward(

    # We only apply sdp attention if we don't need to output the whole attention matrix
    if not output_attentions:
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=False,
-        )
-        attn_weights = None
+        with torch.backends.cuda.sdp_kernel():
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                is_causal=False,
+            )
+            attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -53,7 +53,7 @@ from transformers.utils import (
    replace_return_docstrings,
 )

-logger = logging.get_logger(__name__)
+LOG = logging.getLogger("axolotl")

 _CONFIG_FOR_DOC = "LlamaConfig"

@@ -862,7 +862,7 @@ class LlamaModel(LlamaPreTrainedModel):

        if self.gradient_checkpointing and self.training:
            if use_cache:
-                logger.warning_once(
+                LOG.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -66,15 +66,34 @@ class SystemDataPrompter(AlpacaPrompter):
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
+        formatted_sys_prompt = f"### System:\n{system}\n\n" if system else ""
        if input:
-            res = system + self.turn_format.format(instruction=instruction, input=input)
+            res = formatted_sys_prompt + self.turn_format.format(
+                instruction=instruction, input=input
+            )
        else:
-            res = system + self.turn_no_input_format.format(instruction=instruction)
+            res = formatted_sys_prompt + self.turn_no_input_format.format(
+                instruction=instruction
+            )
        if output:
            res = f"{res}{output}"
        yield res


+class OpenOrcaSystemDataPrompter(SystemDataPrompter):
+    """
+    Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
+    """
+
+    def match_prompt_style(self):
+        if self.prompt_style == PromptStyle.INSTRUCT.value:
+            self.turn_format = "### User:\n{instruction}\n\n### Additional Context:\n{input}\n\n### Assistant:\n"
+            self.turn_no_input_format = "### User:\n{instruction}\n\n### Assistant:\n"
+        if self.prompt_style == PromptStyle.CHAT.value:
+            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
+            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
+
+
 class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca datasets
@@ -113,7 +132,7 @@ def load_chat(tokenizer, cfg):

 def load_open_orca(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
-        SystemDataPrompter(PromptStyle.INSTRUCT.value),
+        OpenOrcaSystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
--- a/src/axolotl/prompt_strategies/pygmalion.py
+++ b/src/axolotl/prompt_strategies/pygmalion.py
@@ -11,6 +11,8 @@ from axolotl.prompt_tokenizers import (
    tokenize_prompt_default,
 )

+LOG = logging.getLogger("axolotl")
+
 IGNORE_TOKEN_ID = -100


@@ -64,7 +66,7 @@ class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
                    *copy.deepcopy(res["input_ids"])
                ][len(self.bot_prefix_token_ids) :]
            else:
-                logging.warning(f"unknown role in conversation: {role}")
+                LOG.warning(f"unknown role in conversation: {role}")
                res = defaultdict(lambda: [])

            # pylint: disable=duplicate-code
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -10,6 +10,8 @@ from transformers import PreTrainedTokenizer

 from axolotl.prompters import IGNORE_TOKEN_ID

+LOG = logging.getLogger("axolotl")
+
 IGNORE_INDEX = -100
 LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
 LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
@@ -46,16 +48,22 @@ class PromptTokenizingStrategy(abc.ABC):

    @functools.lru_cache(maxsize=128)
    def _get_user_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
        return False

    @functools.lru_cache(maxsize=128)
    def _get_assistant_token(self):
-        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
-        if isinstance(id_or_ids, (int,)):
-            return id_or_ids
+        try:
+            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
+            if isinstance(id_or_ids, (int,)):
+                return id_or_ids
+        except KeyError:
+            pass
        return False

    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
@@ -384,7 +392,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                    else:
-                        logging.warning(f"unhandled role: {part[0]}")
+                        LOG.warning(f"unhandled role: {part[0]}")

                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -5,6 +5,7 @@ import logging
 from enum import Enum, auto
 from typing import Generator, List, Optional, Tuple, Union

+LOG = logging.getLogger("axolotl")
 IGNORE_TOKEN_ID = -100


@@ -241,7 +242,7 @@ class Conversation:
            if message:
                yield (role + ":", " " + message)
            else:
-                logging.warning(f"role with empty message: {role}")
+                LOG.warning(f"role with empty message: {role}")
                yield (role + ":", "")

    def copy(self):
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -35,9 +36,11 @@ from axolotl.prompters import (
    SummarizeTLDRPrompter,
 )

+LOG = logging.getLogger("axolotl")
+

 def load_tokenized_prepared_datasets(
-    split, tokenizer, cfg, default_dataset_prepared_path
+    tokenizer, cfg, default_dataset_prepared_path
 ) -> DatasetDict:
    tokenizer_name = tokenizer.__class__.__name__
    ds_hash = str(
@@ -49,8 +52,6 @@ def load_tokenized_prepared_datasets(
                    sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
                )
                + "|"
-                + split
-                + "|"
                + tokenizer_name
            ).encode("utf-8")
        ).hexdigest()
@@ -68,24 +69,24 @@ def load_tokenized_prepared_datasets(
                f"{cfg.push_dataset_to_hub}/{ds_hash}",
                use_auth_token=use_auth_token,
            )
-            dataset = dataset[split]
+            dataset = dataset["train"]
    except Exception:  # pylint: disable=broad-except # nosec
        pass

    if dataset:
        ...
    elif any(prepared_ds_path.glob("*")):
-        logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
        dataset = load_from_disk(str(prepared_ds_path))
-        logging.info("Prepared dataset loaded from disk...")
+        LOG.info("Prepared dataset loaded from disk...")
    else:
-        logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
-        logging.info("Loading raw datasets...")
+        LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
+        LOG.info("Loading raw datasets...")

        if cfg.seed:
            seed = cfg.seed
        else:
-            logging.info("No seed provided, using default seed of 42")
+            LOG.info("No seed provided, using default seed of 42")
            seed = 42

        datasets = []
@@ -96,6 +97,7 @@ def load_tokenized_prepared_datasets(
            try:
                load_dataset(
                    d.path,
+                    name=d.name,
                    streaming=True,
                    use_auth_token=use_auth_token,
                )
@@ -104,40 +106,51 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
-            elif ds_from_hub:
-                if d.data_files:
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
                    ds = load_dataset(
                        d.path,
-                        streaming=False,
+                        name=d.name,
                        data_files=d.data_files,
-                        use_auth_token=use_auth_token,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        name=d.name,
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
                    )
                else:
-                    ds = load_dataset(
-                        d.path,
-                        streaming=False,
-                        use_auth_token=use_auth_token,
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory or a file"
                    )
+            elif ds_from_hub:
+                ds = load_dataset(
+                    d.path,
+                    name=d.name,
+                    streaming=False,
+                    data_files=d.data_files,
+                    use_auth_token=use_auth_token,
+                )
            else:
                fp = hf_hub_download(
                    repo_id=d.path,
                    repo_type="dataset",
                    filename=d.data_files,
                )
-                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
+                ds = load_dataset(
+                    "json", name=d.name, data_files=fp, streaming=False, split=None
+                )
            if not ds:
                raise ValueError("unhandled dataset load")
            # support for using a subset of the data
            if d.shards:
-                if split in ds:
-                    ds = ds.shuffle(seed=seed)[split].shard(
+                if "train" in ds:
+                    ds = ds.shuffle(seed=seed)["train"].shard(
                        num_shards=d.shards, index=0
                    )
                else:
@@ -146,8 +159,8 @@ def load_tokenized_prepared_datasets(
            d_type_split = d_type.split(":")
            d_base_type = d_type_split[0]
            d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
-            if split in ds:
-                ds = ds[split]
+            if "train" in ds:
+                ds = ds["train"]
            if ds_strategy := load(d.type, tokenizer, cfg):
                ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                datasets.append(ds_wrapper)
@@ -245,25 +258,29 @@ def load_tokenized_prepared_datasets(
                suffix = ""
                if ":load_" in d.type:
                    suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
-                logging.error(
-                    f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
-                )
+                LOG.error(f"unhandled prompt tokenization strategy: {d.type}. {suffix}")
                raise ValueError(
                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
                )
-        logging.info("tokenizing, merging, and shuffling master dataset")
+        LOG.info("tokenizing, merging, and shuffling master dataset")

        samples: List[int] = []
+        chunk_size = 1000
        for d in datasets:
-            samples = samples + list(d)
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
        dataset = Dataset.from_list(samples).shuffle(seed=seed)
        if cfg.local_rank == 0:
-            logging.info(
-                f"Saving merged prepared dataset to disk... {prepared_ds_path}"
-            )
+            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
            dataset.save_to_disk(prepared_ds_path)
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
@@ -314,63 +331,53 @@ def load_prepare_datasets(
        use_auth_token = cfg.hf_use_auth_token
        try:
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset = load_dataset(
                    f"{cfg.push_dataset_to_hub}/{ds_hash}",
                    use_auth_token=use_auth_token,
                )
+                dataset = dataset["train"]
        except Exception:  # pylint: disable=broad-except # nosec
            pass

        if dataset:
            ...
        elif any(prepared_ds_path.glob("*")):
-            logging.info(
+            LOG.info(
                f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
            )
            dataset = load_from_disk(str(prepared_ds_path))
-            logging.info("Prepared packed dataset loaded from disk...")
+            LOG.info("Prepared packed dataset loaded from disk...")
            if cfg.push_dataset_to_hub:
-                logging.info(
+                LOG.info(
                    f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
                    f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
                )
        else:
-            dataset_train = load_tokenized_prepared_datasets(
-                "train", tokenizer, cfg, default_dataset_prepared_path
+            dataset = load_tokenized_prepared_datasets(
+                tokenizer, cfg, default_dataset_prepared_path
            )
-            dataset_test = load_tokenized_prepared_datasets(
-                "test", tokenizer, cfg, default_dataset_prepared_path
-            )
-            dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
+
            if cfg.seed:
                dataset = dataset.shuffle(seed=cfg.seed)

-            constant_len_dataset_train = ConstantLengthDataset(
+            constant_len_dataset = ConstantLengthDataset(
                tokenizer,
-                [dataset["train"]],
+                [dataset],
                seq_length=max_packed_sequence_len,
            )
-            constant_len_dataset_test = ConstantLengthDataset(
-                tokenizer,
-                [dataset["test"]],
-                seq_length=max_packed_sequence_len,
-            )
-            logging.info(
-                f"packing master dataset to len: {cfg.max_packed_sequence_len}"
-            )
-            dataset_train = Dataset.from_list(list(constant_len_dataset_train))
-            dataset_test = Dataset.from_list(list(constant_len_dataset_test))
+            LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
+            dataset = Dataset.from_list(list(constant_len_dataset))

            # filter out bad data
-            dataset_train = Dataset.from_list(
+            dataset = Dataset.from_list(
                [
                    d
-                    for d in dataset_train
+                    for d in dataset
                    if len(d["input_ids"]) < cfg.sequence_len
                    and len(d["input_ids"]) > 0
                    and len(d["input_ids"]) == len(d["attention_mask"])
@@ -378,26 +385,13 @@ def load_prepare_datasets(
                ]
            )

-            # filter out bad data
-            dataset_test = Dataset.from_list(
-                [
-                    d
-                    for d in dataset_test
-                    if len(d["input_ids"]) < cfg.sequence_len
-                    and len(d["input_ids"]) > 0
-                    and len(d["input_ids"]) == len(d["attention_mask"])
-                    and len(d["input_ids"]) == len(d["labels"])
-                ]
-            )
-            dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
-
            if cfg.local_rank == 0:
-                logging.info(
+                LOG.info(
                    f"Saving packed prepared dataset to disk... {prepared_ds_path}"
                )
                dataset.save_to_disk(prepared_ds_path)
                if cfg.push_dataset_to_hub:
-                    logging.info(
+                    LOG.info(
                        f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                    )
                    dataset.push_to_hub(
@@ -405,17 +399,12 @@ def load_prepare_datasets(
                        private=True,
                    )
    else:
-        # dataset_train = load_tokenized_prepared_datasets(
        dataset = load_tokenized_prepared_datasets(
-            "train", tokenizer, cfg, default_dataset_prepared_path
+            tokenizer, cfg, default_dataset_prepared_path
        )
-        # dataset_test = load_tokenized_prepared_datasets(
-        #     "test", tokenizer, cfg, default_dataset_prepared_path
-        # )
-        # dataset = DatasetDict({"train": dataset_train, "test": dataset_test})

    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
-        logging.info(
+        LOG.info(
            f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
        )
        dataset = dataset.shard(
@@ -427,9 +416,6 @@ def load_prepare_datasets(
        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
        train_dataset = dataset["train"]
        eval_dataset = dataset["test"]
-    elif "train" in dataset:
-        train_dataset = dataset["train"]
-        eval_dataset = dataset["test"]
    else:
        train_dataset = dataset
        eval_dataset = None
@@ -539,7 +525,7 @@ def encode_pretraining(tokenizer, max_tokens, examples):
        "attention_mask": [seq.tolist() for seq in new_attention_mask],
    }

-    logging.debug(len(ret["input_ids"]))
+    LOG.debug(len(ret["input_ids"]))
    return ret


--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -23,6 +23,8 @@ from transformers import (  # noqa: F401

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN

+LOG = logging.getLogger("axolotl")
+
 if TYPE_CHECKING:
    from peft import PeftConfig  # noqa: F401

@@ -50,10 +52,10 @@ def load_tokenizer(
            use_fast=use_fast,
        )

-    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    logging.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    logging.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if tokenizer.__class__.__name__ in [
        "LlamaTokenizer",
@@ -90,23 +92,25 @@ def load_model(

    if cfg.is_llama_derived_model and cfg.flash_attention:
        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
-            from axolotl.flash_attn import replace_llama_attn_with_flash_attn
+            from axolotl.monkeypatch.llama_attn_hijack_flash import (
+                replace_llama_attn_with_flash_attn,
+            )

-            logging.info("patching with flash attention")
+            LOG.info("patching with flash attention")
            replace_llama_attn_with_flash_attn()
    elif cfg.is_llama_derived_model and cfg.xformers_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
        )

-        logging.info("patching with xformers attention")
+        LOG.info("patching with xformers attention")
        hijack_llama_attention()
    elif cfg.is_llama_derived_model and cfg.sdp_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_sdp_attention,
        )

-        logging.info("patching with sdp attention")
+        LOG.info("patching with sdp attention")
        hijack_llama_sdp_attention()
    elif cfg.is_llama_derived_model and cfg.landmark_attention:
        from axolotl.monkeypatch.llama_landmark_attn import (
@@ -114,7 +118,7 @@ def load_model(
            patch_llama_with_landmark_attn,
        )

-        logging.info("patching with landmark attention")
+        LOG.info("patching with landmark attention")
        patch_llama_with_landmark_attn()

        # Note: This might overwrite previous additional_special_tokens
@@ -125,7 +129,7 @@ def load_model(
            replace_llama_rope_with_xpos_rope,
        )

-        logging.info("patching with xpos rope")
+        LOG.info("patching with xpos rope")
        replace_llama_rope_with_xpos_rope()

    if cfg.bf16 or cfg.bfloat16:
@@ -142,18 +146,24 @@ def load_model(

            replace_peft_model_with_int4_lora_model()
    except Exception as err:
-        logging.exception(err)
+        LOG.exception(err)
        raise err

-    try:
-        from peft import prepare_model_for_kbit_training
-    except ImportError:
-        # For backward compatibility
-        from peft import (
-            prepare_model_for_int8_training as prepare_model_for_kbit_training,
-        )
+    if not cfg.gptq and (
+        (cfg.adapter == "lora" and load_in_8bit)
+        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
+    ):
+        try:
+            from peft import prepare_model_for_kbit_training
+        except ImportError:
+            # For backward compatibility
+            from peft import (
+                prepare_model_for_int8_training as prepare_model_for_kbit_training,
+            )

    model_kwargs = {}
+    if cfg.model_revision:
+        model_kwargs["revision"] = cfg.model_revision
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -185,7 +195,7 @@ def load_model(
                if len(files) > 0:
                    model_path = str(files[0])
                else:
-                    logging.warning(
+                    LOG.warning(
                        "unable to find a cached model file, this will likely fail..."
                    )
                    model_path = str(cache_model_path)
@@ -202,7 +212,7 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif cfg.is_llama_derived_model:
+        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
            from transformers import LlamaForCausalLM

            config = LlamaConfig.from_pretrained(base_model_config)
@@ -241,7 +251,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type:
+        elif model_type and not cfg.trust_remote_code:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -264,14 +274,14 @@ def load_model(
                and cfg.sequence_len > config.max_seq_len
            ):
                config.max_seq_len = cfg.sequence_len
-                logging.warning(f"increasing context length to {cfg.sequence_len}")
+                LOG.warning(f"increasing context length to {cfg.sequence_len}")
            elif (
                hasattr(config, "max_sequence_length")
                and config.max_sequence_length
                and cfg.sequence_len > config.max_sequence_length
            ):
                config.max_sequence_length = cfg.sequence_len
-                logging.warning(f"increasing context length to {cfg.sequence_len}")
+                LOG.warning(f"increasing context length to {cfg.sequence_len}")
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                config=config,
@@ -283,10 +293,10 @@ def load_model(
                **model_kwargs,
            )
    except Exception as err:  # pylint: disable=broad-exception-caught
-        logging.error(
+        LOG.error(
            "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
        )
-        logging.exception(err)
+        LOG.exception(err)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -297,7 +307,11 @@ def load_model(
            **model_kwargs,
        )

-    embeddings_len = math.ceil(len(tokenizer) / 32) * 32
+    embeddings_len = (
+        math.ceil(len(tokenizer) / 32) * 32
+        if cfg.resize_token_embeddings_to_32x
+        else len(tokenizer)
+    )
    model.resize_token_embeddings(embeddings_len)

    if (
@@ -305,7 +319,7 @@ def load_model(
        and model.config.max_position_embeddings
        and cfg.sequence_len >= model.config.max_position_embeddings
    ):
-        logging.warning(
+        LOG.warning(
            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
        )
        model.config.max_position_embeddings = cfg.sequence_len
@@ -314,11 +328,21 @@ def load_model(
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
-        logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
+        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )

+        # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
+        # convert them back to fp16/bf16 for flash-attn compatibility.
+        if cfg.flash_attention and cfg.is_llama_derived_model:
+            for name, module in model.named_modules():
+                if "norm" in name:
+                    module.to(torch_dtype)
+                if "lm_head" in name or "embed_tokens" in name:
+                    if hasattr(module, "weight"):
+                        module.to(torch_dtype)
+
    model, lora_config = load_adapter(model, cfg, adapter)

    if cfg.ddp and not load_in_8bit:
@@ -326,7 +350,7 @@ def load_model(

    if cfg.gptq:
        # Scales to half
-        logging.info("Fitting 4bit scales and zeros to half")
+        LOG.info("Fitting 4bit scales and zeros to half")
        for _, module in model.named_modules():
            if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
                type(module)
@@ -352,7 +376,7 @@ def load_model(
        if param.requires_grad:
            requires_grad.append(f"{name}: {param.requires_grad}")
    if len(requires_grad) == 0:
-        logging.warning("there are no parameters that require gradient updates")
+        LOG.warning("there are no parameters that require gradient updates")
    model.config.use_cache = False

    if cfg.flash_optimum:
@@ -386,7 +410,7 @@ def load_llama_adapter(model, cfg):
    )

    if cfg.lora_model_dir:
-        logging.info("Loading pretained LORA")
+        LOG.info("Loading pretained LORA")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
@@ -433,7 +457,7 @@ def load_lora(model, cfg):
            bits = 8

        linear_names = find_all_linear_names(bits, model)
-        logging.info(f"found linear modules: {repr(linear_names)}")
+        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

    lora_config = LoraConfig(
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,6 +1,9 @@
 """Module for custom LRScheduler class"""
+import math
+from functools import partial

-from torch.optim.lr_scheduler import LRScheduler
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR, LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -42,3 +45,58 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
+
+
+def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
+    current_step: int,
+    *,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float
+):  # LR multiplier for `current_step`; bound via partial() and handed to LambdaLR
+    if current_step < num_warmup_steps:  # warmup: (step / warmup_steps) squared
+        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
+    progress = float(current_step - num_warmup_steps) / float(  # post-warmup fraction
+        max(1, num_training_steps - num_warmup_steps)  # max(1, ...): no divide-by-zero
+    )
+    return max(  # cosine decay term; max() floors the result at 0.0
+        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
+    )
+
+
+def get_cosine_schedule_with_quadratic_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+):
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases quadratically between 0 and
+    the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        num_cycles (`float`, *optional*, defaults to 0.5):
+            The number of waves in the cosine schedule (the default is to just decrease from the max value to 0
+            following a half-cosine).
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    lr_lambda = partial(
+        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        num_cycles=num_cycles,
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -5,6 +5,8 @@ import logging

 from termcolor import colored

+LOG = logging.getLogger("axolotl")
+

 def check_dataset_labels(dataset, tokenizer):
    # the dataset is already shuffled, so let's just check the first 5 elements
@@ -32,7 +34,7 @@ def check_example_labels(example, tokenizer):
        )
        colored_tokens.append(colored_token)

-    logging.info(" ".join(colored_tokens))
-    logging.info("\n\n\n")
+    LOG.info(" ".join(colored_tokens))
+    LOG.info("\n\n\n")

    return " ".join(colored_tokens)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -5,6 +5,7 @@ import logging
 import math
 import os
 import sys
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional

@@ -13,17 +14,70 @@ import torch.cuda
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import EarlyStoppingCallback, Trainer
+from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
 )
-from axolotl.utils.schedulers import InterpolatingLogScheduler
+from axolotl.utils.schedulers import (
+    InterpolatingLogScheduler,
+    get_cosine_schedule_with_quadratic_warmup,
+)
+
+LOG = logging.getLogger("axolotl")


-class OneCycleLRSchedulerTrainer(Trainer):
+@dataclass
+class AxolotlTrainingArguments(TrainingArguments):
+    """
+    Extend the base TrainingArguments with axolotl-specific options
+    """
+
+    lr_quadratic_warmup: bool = field(  # read by AxolotlTrainer.create_scheduler
+        default=False,
+        metadata={"help": "Use quadratic warmup for cosine scheduling."},
+    )
+
+
+class AxolotlTrainer(Trainer):
+    """
+    Extend the base Trainer with axolotl's quadratic-warmup cosine scheduler option
+    """
+
+    args = None  # type: AxolotlTrainingArguments
+
+    def create_scheduler(
+        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
+    ):
+        """
+        Set up and return the LR scheduler; an existing self.lr_scheduler is returned unchanged. When lr_scheduler_type
+        is "cosine" and lr_quadratic_warmup is True, the quadratic-warmup cosine schedule is built; else Trainer's own.
+
+        Args:
+            num_training_steps (int): The number of training steps to do.
+            optimizer (torch.optim.Optimizer): The training optimizer (the signature defaults it to None)
+        """
+
+        # fmt: off
+        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
+            # fmt: on
+            if (
+                self.args.lr_scheduler_type == "cosine"
+                and self.args.lr_quadratic_warmup is True
+            ):
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
+                    optimizer,  # NOTE(review): forwarded as given, no fallback if None; confirm callers pass one
+                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
+                    num_training_steps=num_training_steps,
+                )
+            else:
+                return super().create_scheduler(num_training_steps, optimizer)  # stock Trainer behavior
+        return self.lr_scheduler
+
+
+class OneCycleLRSchedulerTrainer(AxolotlTrainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -103,6 +157,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

+    if cfg.lr_quadratic_warmup is not None:
+        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
+
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -124,11 +181,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.max_grad_norm:
        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm

-    if cfg.push_to_hub_model_id:
-        training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id
+    if cfg.hub_model_id:
+        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
        training_arguments_kwargs["push_to_hub"] = True
+        training_arguments_kwargs["hub_private_repo"] = True

-    training_args = transformers.TrainingArguments(
+    if cfg.save_safetensors:
+        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors
+
+    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -137,9 +198,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        eval_accumulation_steps=cfg.gradient_accumulation_steps,
        num_train_epochs=cfg.num_epochs,
        learning_rate=cfg.learning_rate,
-        evaluation_strategy="steps",
+        evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
        save_strategy="steps" if cfg.save_steps else "epoch",
-        eval_steps=cfg.eval_steps,
+        eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
        save_steps=cfg.save_steps,
        output_dir=cfg.output_dir,
        save_total_limit=3,
@@ -266,7 +327,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):

        set_model_mem_id(model, tokenizer)

-        logging.info("Adding landmark attention tokens to dataset")
+        LOG.info("Adding landmark attention tokens to dataset")

        for dataset in [train_dataset, eval_dataset]:
            dataset = dataset.map(
@@ -278,7 +339,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else transformers.Trainer
+        else AxolotlTrainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -4,6 +4,8 @@ import logging

 import torch

+LOG = logging.getLogger("axolotl")
+

 def validate_config(cfg):
    if cfg.gradient_accumulation_steps and cfg.batch_size:
@@ -11,7 +13,7 @@ def validate_config(cfg):
            "please set only one of gradient_accumulation_steps or batch_size"
        )
    if cfg.batch_size:
-        logging.warning(
+        LOG.warning(
            "%s\n%s",
            "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
@@ -44,10 +46,10 @@ def validate_config(cfg):
                raise ValueError("Require cfg.load_in_4bit to be True for qlora")

    if not cfg.load_in_8bit and cfg.adapter == "lora":
-        logging.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
+        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")

    if cfg.trust_remote_code:
-        logging.warning(
+        LOG.warning(
            "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
        )

@@ -66,31 +68,34 @@ def validate_config(cfg):

    if cfg.flash_optimum is True:
        if cfg.adapter:
-            logging.warning(
-                "BetterTransformers probably doesn't work with PEFT adapters"
-            )
+            LOG.warning("BetterTransformers probably doesn't work with PEFT adapters")
        if cfg.fp16 or cfg.bf16:
            raise ValueError("AMP is not supported with BetterTransformer")
        if cfg.float16 is not True and cfg.bloat16 is not True:
-            logging.warning(
+            LOG.warning(
                "You should probably set bfloat16 or float16 to true to "
                "load the model in float16 for BetterTransformers"
            )
        if int(torch.__version__.split(".")[0]) < 2:
-            logging.warning("torch>=2.0.0 required")
+            LOG.warning("torch>=2.0.0 required")
            raise ValueError(
                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
            )

    if cfg.pretraining_dataset and cfg.group_by_length:
-        logging.warning(
+        LOG.warning(
            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
        )

-    if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
+    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
        not cfg.optimizer or "adamw" not in cfg.optimizer
    ):
-        logging.warning("adamw hyperparameters found, but no adamw optimizer set")
+        LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
+
+    if cfg.push_to_hub_model_id:
+        raise ValueError(
+            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
+        )

    # TODO
    # MPT 7b
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -17,7 +17,7 @@ from axolotl.prompt_tokenizers import (
 )
 from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter

-logging.basicConfig(level="INFO")
+LOG = logging.getLogger("axolotl")


 class TestPromptTokenizationStrategies(unittest.TestCase):
@@ -130,8 +130,9 @@ class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
            "output": "Hi! How can I help?",
        }
        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
-        assert example["input_ids"][3] == 11889  # USER
+        assert example["input_ids"][0:4] == [1, 835, 2184, 29901]  # "<s>### System:"
+        assert example["input_ids"][5:7] == [1509, 20118]  # "use cot"
+        assert example["input_ids"][9] == 11889  # USER


 if __name__ == "__main__":
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -70,7 +70,7 @@ class AlpacaPrompterTest(unittest.TestCase):
            )
        )
        assert "use cot" in res
-        assert res.startswith("use cot")
+        assert res.startswith("### System:")
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": None,
-                "adamw_epsilon": 0.0001,
+                "adam_epsilon": 0.0001,
            }
        )

@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adafactor",
-                "adamw_beta1": 0.0001,
+                "adam_beta1": 0.0001,
            }
        )

@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adamw_bnb_8bit",
-                "adamw_beta1": 0.0001,
-                "adamw_beta2": 0.0001,
-                "adamw_epsilon": 0.0001,
+                "adam_beta1": 0.9,
+                "adam_beta2": 0.99,
+                "adam_epsilon": 0.0001,
            }
        )
Author	SHA1	Message	Date
Wing Lian	9793faf6dc	pre-commit formatting fixes Some checks failed pre-commit / pre-commit (push) Has been cancelled Details PyTest / test (3.10) (push) Has been cancelled Details PyTest / test (3.9) (push) Has been cancelled Details	2023-08-05 22:46:02 -04:00
ssmi153	64852ae15a	Whitespace bug fix Command had accidentally been moved out of if-else block.	2023-08-05 15:08:44 +12:00
ssmi153	1fed74b1d9	Catch configs without pretraining_tp	2023-08-05 11:45:12 +12:00
ssmi153	a300a4db1d	Fix XFormers attention for Llama-2 70B (GQA) Updated XFormers MonkeyPatch to handle GQA as used in Llama-2 70B. All the updated code is taken directly from the Transformers library: `07360b6c9c (diff-06392bad3b9e97be9ade60d4ac46f73b6809388f4d507c2ba1384ab872711c51)` from their llama_modeling.py file.	2023-08-05 11:01:44 +12:00
Wing Lian	fe285430bc	optimize the iteration when tokenizeing large datasets (#332 )	2023-08-04 12:12:05 -04:00
Aman Gupta Karmani	0d2e34f056	Merge pull request #336 from tmm1/flash-attn Fix flash-attn + qlora not working with llama models	2023-08-03 16:25:30 -07:00
Aman Gupta Karmani	b56a6c0101	Merge pull request #337 from tmm1/readme-fix update README	2023-08-03 15:14:17 -07:00
Aman Karmani	2eda9e02a9	fix typo	2023-08-03 21:04:12 +00:00
Aman Karmani	78b9efb7f4	scope flash-attn+qlora fix correctly, scope to llama, add comment	2023-08-03 19:19:39 +00:00
Aman Karmani	312a9fad07	move flash-attn monkey patch alongside the others	2023-08-03 17:20:49 +00:00
Aman Karmani	58d665943e	python 3.10 and 3.11 both work fine, as does pytorch 2.1.0.dev	2023-08-03 16:47:25 +00:00
Aman Karmani	cc7e80026e	there is no configs folder	2023-08-03 16:31:37 +00:00
mhenrichsen	dc71d8872a	feat/llama-2 examples (#319 ) * qlora llama-2 * qlora llama-2 * linting * readme * lora added * linting * change group_by_length * 13b fitting on 24gb * grouped lengths true * add pad token * change out dir --------- Co-authored-by: Mads Henrichsen <mads@Brbar-tilhrende-Mads.local>	2023-08-03 19:22:48 +09:00
Aman Karmani	248bf90f89	ensure flash-attn fixes happen in both adapter/lora modes, and use torch_dtype	2023-08-02 20:15:03 +00:00
Wing Lian	77085ea24e	qlora w flash attention fixes (#333 )	2023-08-01 23:26:16 -04:00
Wing Lian	db2a3586f3	add peft install back since it doesn't get installed by setup.py (#331 )	2023-07-31 16:31:53 -04:00
Wing Lian	6c9a87c8ee	pin accelerate so it works with llama2 (#330 )	2023-07-30 22:20:06 -04:00
Wing Lian	894cba09f3	fix FSDP save of final model (#329 )	2023-07-30 21:46:44 -04:00
Wing Lian	41a4d15d43	update README for updated docker images (#328 ) * update README for updated docker images * update readme from pr feedback	2023-07-28 16:50:03 -04:00
Wing Lian	2c37bf6c21	Prune cuda117 (#327 ) * drop cuda117/torch 1.13.1 from support, pin flash attention to v2.0.1, rm torchvision/torchaudio install * gptq base build not needed. add sm 9.0 support	2023-07-26 16:27:49 -04:00
Wing Lian	9f69c4d8c1	latest HEAD of accelerate causes 0 loss immediately w FSDP (#321 )	2023-07-24 11:23:56 -04:00
Wing Lian	3d4984b9a5	update prompts for open orca to match the paper (#317 ) fix the test for the updated system tokenizer	2023-07-22 13:49:11 -04:00
Wing Lian	ff7f18d1ed	disable gh cache for first step of docker builds too	2023-07-22 11:46:37 -04:00
Wing Lian	cf62cfd661	add runpod envs to .bashrc, fix bnb env (#316 ) * hopper support for base dockerfile, add runpod envs to .bashrc * set BNB_CUDA_VERSION env for latest bnb * don't support hopper yet w 118	2023-07-22 10:09:38 -04:00
Wing Lian	c5df969262	don't use the gha cache w docker	2023-07-22 08:46:21 -04:00
Wing Lian	40a53ff181	Merge pull request #307 from OpenAccess-AI-Collective/xgen-user-sharegpt-tokens better handling since xgen tokenizer breaks with convert_tokens_to_ids	2023-07-22 04:10:38 -04:00
Wing Lian	dcdec44347	Merge pull request #306 from ethanhs/xgen Add XGen info to README and example config	2023-07-22 04:10:18 -04:00
Wing Lian	3ffb018a4c	Merge pull request #313 from OpenAccess-AI-Collective/tokenizer-llama2-embeddings don't resize embeddings to multiples of 32x by default	2023-07-22 04:09:59 -04:00
Wing Lian	a94f2eecb1	Merge pull request #299 from OpenAccess-AI-Collective/flash-attention-2 Flash attention 2	2023-07-22 04:07:48 -04:00
Wing Lian	1066751358	don't resize embeddings to multiples of 32x by default	2023-07-22 01:52:38 -04:00
Wing Lian	1b63bf13bc	Merge pull request #308 from OpenAccess-AI-Collective/apache2-license add apache 2.0 license	2023-07-21 09:50:14 -04:00
Wing Lian	5cce2a42ff	add apache 2.0 license	2023-07-21 09:49:29 -04:00
Wing Lian	2a428e8014	better handling since xgen tokenizer breaks with convert_tokens_to_ids	2023-07-21 09:24:11 -04:00
Wing Lian	cdf85fdbd5	pin flash attention 2 to the fix for backwards pass	2023-07-21 08:18:53 -04:00
Wing Lian	9b790d359b	flash attention 2	2023-07-21 08:17:46 -04:00
Ethan Smith	38811434e6	Add XGen info to README and example config	2023-07-21 00:44:50 -07:00
NanoCode012	06c61d6f13	Merge pull request #304 from OpenAccess-AI-Collective/NanoCode012-patch-1 Fix(readme): Improve wording for push model	2023-07-21 13:39:45 +09:00
Wing Lian	262dc29df2	Merge pull request #300 from OpenAccess-AI-Collective/pytorch-201 Pytorch 2.0.1	2023-07-21 00:28:38 -04:00
NanoCode012	165907fddb	Fix(readme): Improve wording for push model	2023-07-21 11:28:35 +09:00
Wing Lian	a032c9f452	fix sdp attention to use the flash/mem-efficient context manaager	2023-07-20 01:05:48 -04:00
Wing Lian	b06d3e3645	explicitly pin flash attention 1 to v1.0.9	2023-07-20 01:02:08 -04:00
Wing Lian	c58034d48c	use pytorch 2.0.1	2023-07-20 00:47:13 -04:00
NanoCode012	28fd429bcf	Merge pull request #293 from NanoCode012/fix/tokenize-speed Fix(tokenizing): Use multi-core	2023-07-19 11:02:04 +09:00
NanoCode012	45ac7c4f88	feat: use multi-core	2023-07-19 10:16:54 +09:00
Wing Lian	edd6980dd9	Merge pull request #289 from OpenAccess-AI-Collective/hf_transfer add hf_transfer to requirements for faster hf upload	2023-07-17 15:08:06 -04:00
Wing Lian	dc6d25124d	Merge pull request #288 from OpenAccess-AI-Collective/NanoCode012-patch-1 fix(readme): remove accelerate config	2023-07-17 14:46:43 -04:00
Wing Lian	6dd2e7d671	add hf_transfer to requirements for faster hf upload	2023-07-17 14:44:48 -04:00
NanoCode012	b64f411849	fix(readme): remove accelerate config	2023-07-18 01:31:02 +09:00
Wing Lian	03a59c1ed4	Merge pull request #287 from OpenAccess-AI-Collective/dataclass-fix fix axolotl training args dataclass annotation	2023-07-17 06:09:23 -04:00
Wing Lian	ebaec3c406	fix axolotl training args dataclass annotation	2023-07-17 04:57:02 -04:00
Wing Lian	73e70e3996	Merge pull request #286 from OpenAccess-AI-Collective/logging-docker-fixes misc fixes	2023-07-17 04:26:39 -04:00
Wing Lian	d75adb9835	misc fixes	2023-07-17 03:00:27 -04:00
Wing Lian	02224668c3	Merge pull request #283 from OpenAccess-AI-Collective/docker-git-fetch git fetch fix for docker	2023-07-17 02:17:00 -04:00
Wing Lian	f162f3c7cc	set transformers cache env var in docker image	2023-07-16 23:03:54 -04:00
Wing Lian	eca3531329	git fetch fix for docker	2023-07-16 22:25:05 -04:00
Wing Lian	6f16c4569d	Merge pull request #276 from theobjectivedad/logging_enhancement Logging update: added PID and formatting	2023-07-16 17:04:52 -04:00
Wing Lian	0bd09c077d	Merge pull request #280 from teknium1/main Update requirements.txt	2023-07-16 16:08:58 -04:00
Wing Lian	469c08c9ba	Merge pull request #279 from NanoCode012/feat/multi-gpu-readme Feat(readme): improve docs on multi-gpu	2023-07-16 16:08:37 -04:00
Wing Lian	334af625d0	Merge pull request #277 from cg123/dataset-name Allow non-default dataset configurations	2023-07-16 16:08:15 -04:00
Teknium	273b3a3aa7	Update requirements.txt Require latest git accelerate to fix saving checkpoint issue	2023-07-16 10:24:24 -07:00
Charles Goddard	3cdd8e4122	Add dataset name to all yaml options in README	2023-07-15 13:17:37 -07:00
NanoCode012	cf5ae6b649	Feat(readme): improve docs on multi-gpu	2023-07-16 01:07:27 +09:00
theobjectivedad	b1f4f7a34d	Fixed pre-commit problems, fixed small bug in logging_config to handle LOG_LEVEL env var	2023-07-15 12:29:35 +00:00
The Objective Dad	83237b8445	Merge branch 'OpenAccess-AI-Collective:main' into logging_enhancement	2023-07-15 06:16:04 -05:00
Charles Goddard	46032a1a1f	Fix formatting mistake	2023-07-14 20:57:27 -07:00
Charles Goddard	8bba64258e	Add example of dataset with configuration name to README	2023-07-14 20:46:21 -07:00
Charles Goddard	88089e8b32	Add ability to pass 'name' argument to load_dataset	2023-07-14 16:46:39 -07:00
NanoCode012	168a7a09cc	Merge pull request #274 from OpenAccess-AI-Collective/NanoCode012-patch-2 Feat: Set push to hub as private by default	2023-07-14 23:15:47 +09:00
NanoCode012	231031a0e1	Merge pull request #275 from NanoCode012/feat/safetensors Feat: Add save_safetensors	2023-07-14 23:07:26 +09:00
theobjectivedad	9234b75cb4	Update log message format, IMO this is easier to read.	2023-07-14 07:36:21 -05:00
theobjectivedad	553a86b52c	Adding logging enhancement	2023-07-14 07:26:19 -05:00
NanoCode012	5daf7d5299	Merge pull request #273 from OpenAccess-AI-Collective/NanoCode012-patch-1 Feat(docs): Add model_revision arg	2023-07-14 21:09:50 +09:00
NanoCode012	5491278a79	Feat: Add save_safetensors	2023-07-14 13:21:47 +09:00
NanoCode012	1514739f0f	Set push to hub as private by default	2023-07-14 13:17:49 +09:00
NanoCode012	896c1aebcf	Feat(docs): Add model_revision arg	2023-07-14 12:56:07 +09:00
Wing Lian	ef17e15483	Merge pull request #272 from OpenAccess-AI-Collective/model-revision support for loading a model by git revision	2023-07-13 23:12:00 -04:00
Wing Lian	69a235061b	support for loading a model by git revision	2023-07-13 22:58:25 -04:00
Wing Lian	687d889928	Merge pull request #271 from OpenAccess-AI-Collective/quadratic-warmup Quadratic warmup	2023-07-10 12:48:02 -04:00
Wing Lian	c4cf567b55	Merge branch 'main' into quadratic-warmup	2023-07-10 12:42:12 -04:00
Wing Lian	c49729d2bc	better configuration for quadratic warmup	2023-07-10 11:52:59 -04:00
Wing Lian	13ac4d8de2	Merge pull request #268 from OpenAccess-AI-Collective/fix-adam-args params are adam_, not adamw_	2023-07-08 12:33:34 -04:00
Wing Lian	19cf0bda99	params are adam_, not adamw_	2023-07-08 12:13:39 -04:00
Wing Lian	f74edd5b56	Merge pull request #266 from OpenAccess-AI-Collective/trust-remote-no-llama	2023-07-07 21:38:11 -04:00
Wing Lian	d69da99c2c	skip explicit model type too if using trust_remote_code	2023-07-07 21:33:11 -04:00
Wing Lian	66afb76a15	don't use llama if trust_remote_code is set since that needs to use AutoModel path	2023-07-07 21:31:02 -04:00
NanoCode012	a692ad3f4c	Merge pull request #264 from OpenAccess-AI-Collective/NanoCode012-patch-1 Fix(readme): local path loading and custom strategy type	2023-07-06 23:34:57 +09:00
NanoCode012	41da98b982	Fix for linter	2023-07-06 23:20:11 +09:00
NanoCode012	9e64f42e0f	Fix local path loading and custom strategy type	2023-07-06 23:08:09 +09:00
Wing Lian	b9b7d4ce92	Merge pull request #221 from utensil/local_dataset [WIP] Support loading data files from a local directory	2023-07-03 09:10:13 -04:00
Wing Lian	9bed281867	Merge pull request #258 from NanoCode012/fix/deprecate-push Fix future deprecation push_to_hub_model_id	2023-07-03 09:08:26 -04:00
NanoCode012	e79c8e617e	Fix future deprecation push_to_hub_model_id	2023-07-03 12:44:29 +09:00
Wing Lian	71456955f5	pin pydantic so deepspeed isn't broken	2023-07-02 22:26:51 -04:00
Utensil	9bdd30cdfd	Support loading data files from a local directory ref: https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path	2023-06-21 08:00:58 +00:00
Wing Lian	7dc580b837	add axolotl trainer and quadratic warmup	2023-06-12 13:16:40 -04:00