tweak to make it work when we have no explicit test split

make sure we eval for openorca
handle orca splits
2023-07-11 22:40:21 -04:00 · 2023-07-02 17:59:10 -04:00 · 2023-07-01 07:20:23 -04:00
33 changed files with 307 additions and 1040 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -18,13 +18,23 @@ jobs:
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            pytorch: 2.0.0
+            axolotl_extras:
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+            pytorch: 2.0.0
+            axolotl_extras:
+          - cuda: "117"
+            cuda_version: 11.7.1
+            python_version: "3.9"
+            pytorch: 1.13.1
+            axolotl_extras:
+          - cuda: "118"
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.0
+            axolotl_extras: gptq
    steps:
      - name: Checkout
        uses: actions/checkout@v3
@@ -48,9 +58,11 @@ jobs:
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
-            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
+            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,18 +17,23 @@ jobs:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras:
          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras: gptq
+          - cuda: cu117
+            cuda_version: 11.7.1
+            python_version: "3.9"
+            pytorch: 1.13.1
+            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -50,11 +55,13 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
  build-axolotl-runpod:
    needs: build-axolotl
    if: github.repository_owner == 'OpenAccess-AI-Collective'
@@ -62,21 +69,26 @@ jobs:
    strategy:
      matrix:
        include:
-          - cuda: 118
+          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras:
-          - cuda: 118
+          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras:
-          - cuda: 118
+          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
-            pytorch: 2.0.1
+            pytorch: 2.0.0
            axolotl_extras: gptq
+          - cuda: cu117
+            cuda_version: 11.7.1
+            python_version: "3.9"
+            pytorch: 1.13.1
+            axolotl_extras:
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -98,9 +110,10 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          file: ./docker/Dockerfile-runpod
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
--- a/202
+++ b/202
@@ -1,202 +0,0 @@
-
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-   1. Definitions.
-
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-
-   END OF TERMS AND CONDITIONS
-
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "[]"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
-   Copyright [yyyy] [name of copyright owner]
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--- a/README.md
+++ b/README.md
@@ -24,12 +24,11 @@
 | mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |
 | falcon   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
 | gpt-j    | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❓          | ✅             |
-| XGen     | ✅         | ❓    | ✅     | ❓    | ❓            | ❓                 | ❓          | ✅


 ## Quickstart ⚡

-**Requirements**: Python >=3.9 and Pytorch >=2.0.
+**Requirements**: Python 3.9 and PyTorch 2.0.

 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
@@ -37,6 +36,8 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
 pip3 install -e .
 pip3 install -U git+https://github.com/huggingface/peft.git

+accelerate config
+
 # finetune lora
 accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml

@@ -51,10 +52,11 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

 - Docker
  ```bash
-  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.10-cu118-2.0.1
+  docker run --gpus '"all"' --rm -it winglian/axolotl:main-py3.9-cu118-2.0.0
  ```
-  - `winglian/axolotl-runpod:main-py3.10-cu118-2.0.1`: for runpod
-  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.1-gptq`: for gptq
+  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.0`: for runpod
+  - `winglian/axolotl-runpod:main-py3.9-cu118-2.0.0-gptq`: for gptq
+  - `winglian/axolotl:dev`: dev branch (not usually up to date)

  Or run on the current files for development:

@@ -106,7 +108,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

  3. Install torch
  ```bash
-  pip3 install -U torch --index-url https://download.pytorch.org/whl/cu118
+  pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
  ```

  4. Axolotl
@@ -235,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts

  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
+  2. Use your custom file name as the dataset type.

 Optionally, download some datasets, see [data/README.md](data/README.md)

@@ -243,7 +245,7 @@ Optionally, download some datasets, see [data/README.md](data/README.md)

 ### Config

-See [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:
+See sample configs in [configs](configs) folder or [examples](examples) for quick start. It is recommended to duplicate and modify to your needs. The most important options are:

 - model
  ```yaml
@@ -253,24 +255,10 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod

 - dataset
  ```yaml
-  sequence_len: 2048 # max token length for prompt
-
-  # huggingface repo
  datasets:
-    - path: vicgalle/alpaca-gpt4
-      type: alpaca # format from earlier
-
-  # huggingface repo with specific configuration/subset
-  datasets:
-    - path: EleutherAI/pile
-      name: enron_emails
-      type: completion # format from earlier
-
-  # local
-  datasets:
-    - path: json
-      data_files: data.jsonl # or json
+    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
      type: alpaca # format from earlier
+  sequence_len: 2048 # max token length per prompt
  ```

 - loading
@@ -309,8 +297,6 @@ base_model_ignore_patterns:
 # if the base_model repo on hf hub doesn't include configuration .json files,
 # you can set that here, or leave this empty to default to base_model
 base_model_config: ./llama-7b-hf
-# you can specify to choose a specific model revision from huggingface hub
-model_revision:
 # Optional tokenizer configuration override in case you want to use a different tokenizer
 # than the one defined in the base model
 tokenizer_config:
@@ -322,9 +308,6 @@ tokenizer_type: AutoTokenizer
 trust_remote_code:
 # use_fast option for tokenizer loading from_pretrained, default to True
 tokenizer_use_fast:
-# resize the model embeddings when new tokens are added to multiples of 32
-# this is reported to improve training speed on some models
-resize_token_embeddings_to_32x:

 # whether you are training a 4-bit GPTQ quantized model
 gptq: true
@@ -345,13 +328,12 @@ tf32: true # require >=ampere

 # a list of one or more datasets to finetune the model with
 datasets:
-  # hf dataset repo | "json" for local dataset, make sure to fill data_files
+  # this can be either a hf dataset or a relative path
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+    type: alpaca # format OR format:prompt_style (chat/instruct)
    data_files: # path to source data files
    shards: # number of shards to split data into
-    name: # name of dataset configuration to load

 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
@@ -359,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared
 # push prepared dataset to hub
 push_dataset_to_hub: # repo path
 # push checkpoints to hub
-hub_model_id: # repo path to push finetuned model
+push_to_hub_model_id: # repo path
 # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
 # required to be true when used in combination with `push_dataset_to_hub`
 hf_use_auth_token: # boolean
@@ -421,9 +403,6 @@ logging_steps:
 save_steps:
 eval_steps:

-# save model as safetensors (require safetensors package)
-save_safetensors:
-
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
 # don't use this, leads to wonky training (according to someone on the internet)
@@ -515,6 +494,17 @@ strict:

 </details>

+### Accelerate
+
+Configure accelerate
+
+```bash
+accelerate config
+
+# Edit manually
+# nano ~/.cache/huggingface/accelerate/default_config.yaml
+```
+
 ### Train

 Run
@@ -522,21 +512,6 @@ Run
 accelerate launch scripts/finetune.py configs/your_config.yml
 ```

-#### Multi-GPU Config
-
- llama FSDP
-```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_offload_params: true
-  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-```
-
- llama Deepspeed: append `ACCELERATE_USE_DEEPSPEED=true` in front of finetune command
-
 ### Inference

 Pass the appropriate flag to the train command:
@@ -587,10 +562,6 @@ Try set `fp16: true`

 Try to turn off xformers.

-> accelerate config missing
-
-It's safe to ignore it.
-
 ## Need help? 🙋♂️

 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -3,15 +3,16 @@ FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
-ARG CUDA="118"
-ENV BNB_CUDA_VERSION=$CUDA

 RUN apt-get update && \
    apt-get install -y vim curl

 WORKDIR /workspace

-RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
+RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main" \
+            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
+            "transformers @ git+https://github.com/huggingface/transformers.git@main"
+
 RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN cd axolotl && \
@@ -21,10 +22,5 @@ RUN cd axolotl && \
        pip install -e .; \
    fi

-# fix so that git fetch/pull from remote works
-RUN cd axolotl && \
-    git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
 # helper for huggingface-login cli
 RUN git config --global credential.helper store
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -8,7 +8,7 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a
 ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.9"
-ARG PYTORCH_VERSION="2.0.1"
+ARG PYTORCH="2.0.0"
 ARG CUDA="118"

 ENV PYTHON_VERSION=$PYTHON_VERSION
@@ -29,18 +29,17 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu$CUDA


 FROM base-builder AS flash-attn-builder

 WORKDIR /workspace

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"

-RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
+RUN git clone https://github.com/HazyResearch/flash-attention.git && \
    cd flash-attention && \
-    git checkout v2.0.1  && \
    python3 setup.py bdist_wheel && \
    cd csrc/fused_dense_lib && \
    python3 setup.py bdist_wheel && \
@@ -53,7 +52,7 @@ RUN git clone https://github.com/Dao-AILab/flash-attention.git && \

 FROM base-builder AS deepspeed-builder

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"

 WORKDIR /workspace

@@ -74,9 +73,6 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \

 FROM base-builder

-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
 # recompile apex
 RUN python3 -m pip uninstall -y apex
 RUN git clone https://github.com/NVIDIA/apex
@@ -101,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10
+    pip3 install -U --no-cache-dir pydantic
--- a/docker/Dockerfile-runpod
+++ b/docker/Dockerfile-runpod
@@ -1,10 +1,6 @@
 ARG BASE_TAG=main
 FROM winglian/axolotl:$BASE_TAG

-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
-
 COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh

 RUN apt install --yes --no-install-recommends openssh-server tmux && \
--- a/examples/llama-2/README.md
+++ b/examples/llama-2/README.md
@@ -1,20 +0,0 @@
-# Overview
-
-This is an example of a llama-2 configuration for 7b and 13b. The yaml file contains configuration for the 7b variant, but you can just aswell use the same settings for 13b.
-
-The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
-
-The 13b variant will fit if you change these settings to these values:
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-
-```shell
-accelerate launch scripts/finetune.py examples/llama-2/qlora.yml
-
-```
-or
-
-```shell
-accelerate launch scripts/finetune.py examples/llama-2/lora.yml
-
-```
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -1,66 +0,0 @@
-base_model: meta-llama/Llama-2-7b-hf
-base_model_config: meta-llama/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./lora-out
-
-sequence_len: 4096
-max_packed_sequence_len: 4096
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: true
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention: true
-flash_attention:
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
-  pad_token: "<pad>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -1,67 +0,0 @@
-base_model: meta-llama/Llama-2-7b-hf
-base_model_config: meta-llama/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 4096
-max_packed_sequence_len: 4096
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: paged_adamw_32bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: true
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention: true
-flash_attention:
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
-  pad_token: "<pad>"
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -1,90 +0,0 @@
-# An example finetuning Saleforce's XGen-7b model with 8k context using qlora
-# on Tim Dettmer's Guanaco dataset.
-base_model: Salesforce/xgen-7b-8k-base
-base_model_config: Salesforce/xgen-7b-8k-base
-trust_remote_code: true
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: false
-# enable 4bit for QLoRA
-load_in_4bit: true
-gptq: false
-strict: false
-push_dataset_to_hub:
-datasets:
-  - path: timdettmers/openassistant-guanaco
-    data_files:
-      - openassistant_best_replies_train.jsonl
-    type: "completion"
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-# enable QLoRA
-adapter: qlora
-lora_model_dir:
-sequence_len: 8192
-max_packed_sequence_len:
-
-# hyperparameters from QLoRA paper Appendix B.2
-# "We find hyperparameters to be largely robust across datasets"
-lora_r: 64
-lora_alpha: 16
-# 0.1 for models up to 13B
-# 0.05 for 33B and 65B models
-lora_dropout: 0.05
-# add LoRA modules on all linear layers of the base model
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./qlora-out
-
-# QLoRA paper Table 9
-# - 16 for 7b & 13b
-# - 32 for 33b, 64 for 64b
-# Max size tested on A6000
-# - 7b: 40
-# - 40b: 4
-# decrease if OOM, increase for max VRAM utilization
-micro_batch_size: 1
-gradient_accumulation_steps: 1
-num_epochs: 3
-# Optimizer for QLoRA
-optimizer: paged_adamw_32bit
-torchdistx_path:
-lr_scheduler: cosine
-# QLoRA paper Table 9
-# - 2e-4 for 7b & 13b
-# - 1e-4 for 33b & 64b
-learning_rate: 0.00002
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-gradient_checkpointing: true
-# stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-resume_from_checkpoint:
-auto_resume_from_checkpoints: true
-local_rank:
-logging_steps: 1
-xformers_attention: true
-flash_attention:
-gptq_groupsize:
-gptq_model_v1:
-warmup_steps: 10
-eval_steps: 50
-save_steps: 50
-debug:
-deepspeed:
-weight_decay: 0.0
-special_tokens:
-  eos_token: "<|endoftext|>"
-  bos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
-  pad_token: "<|endoftext|>"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.39.0
-accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
+accelerate
 addict
 fire
 PyYAML==6.0
@@ -12,7 +12,6 @@ wandb
 einops
 xformers
 optimum
-hf_transfer
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
--- a/scripts/alpaca_json_to_jsonl.py
+++ b/scripts/alpaca_json_to_jsonl.py
@@ -15,9 +15,6 @@ from axolotl.convert import (
    JsonToJsonlConverter,
    StdoutWriter,
 )
-from axolotl.logging_config import configure_logging
-
-configure_logging()

 # add src to the pythonpath so we don't need to pip install this
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -17,7 +17,6 @@ import yaml
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

-from axolotl.logging_config import configure_logging
 from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
@@ -30,12 +29,9 @@ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

-configure_logging()
-LOG = logging.getLogger("axolotl.scripts")
-

+logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
-os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


 def choose_device(cfg):
@@ -216,7 +212,7 @@ def train(

    # load the tokenizer first
    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
-    LOG.info(f"loading tokenizer... {tokenizer_config}")
+    logging.info(f"loading tokenizer... {tokenizer_config}")
    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)

    if (
@@ -238,7 +234,7 @@ def train(
            eval_dataset = None

    if cfg.debug or "debug" in kwargs:
-        LOG.info("check_dataset_labels...")
+        logging.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
@@ -247,11 +243,11 @@ def train(
        )

    if prepare_ds_only:
-        LOG.info("Finished preparing dataset. Exiting...")
+        logging.info("Finished preparing dataset. Exiting...")
        return

    # Load the model and tokenizer
-    LOG.info("loading model and peft_config...")
+    logging.info("loading model and peft_config...")
    model, peft_config = load_model(
        cfg.base_model,
        cfg.base_model_config,
@@ -262,17 +258,17 @@ def train(
    )

    if "merge_lora" in kwargs and cfg.adapter is not None:
-        LOG.info("running merge of LoRA with base model")
+        logging.info("running merge of LoRA with base model")
        model = model.merge_and_unload()
        model.to(dtype=torch.float16)

        if cfg.local_rank == 0:
-            LOG.info("saving merged model")
+            logging.info("saving merged model")
            model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

    if cfg.inference:
-        LOG.info("calling do_inference function")
+        logging.info("calling do_inference function")
        prompter: Optional[str] = "AlpacaPrompter"
        if "prompter" in kwargs:
            if kwargs["prompter"] == "None":
@@ -291,12 +287,12 @@ def train(
    model.config.use_cache = False

    if torch.__version__ >= "2" and sys.platform != "win32":
-        LOG.info("Compiling torch model")
+        logging.info("Compiling torch model")
        model = torch.compile(model)

    # go ahead and presave, so we have the adapter config available to inspect
    if peft_config:
-        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        logging.info(f"Pre-saving adapter config to {cfg.output_dir}")
        peft_config.save_pretrained(cfg.output_dir)

    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
@@ -312,9 +308,9 @@ def train(
            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
        )

-    LOG.info("Starting trainer...")
+    logging.info("Starting trainer...")
    if cfg.group_by_length:
-        LOG.info("hang tight... sorting dataset for group_by_length")
+        logging.info("hang tight... sorting dataset for group_by_length")
    resume_from_checkpoint = cfg.resume_from_checkpoint
    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
        possible_checkpoints = [
@@ -326,7 +322,7 @@ def train(
                key=lambda path: int(path.split("-")[-1]),
            )
            resume_from_checkpoint = sorted_paths[-1]
-            LOG.info(
+            logging.info(
                f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
            )

@@ -340,13 +336,11 @@ def train(
    else:
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

-    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+    logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
-    if cfg.fsdp:
-        model.save_pretrained(cfg.output_dir)
-    elif cfg.local_rank == 0:
+    if cfg.local_rank == 0:
        if cfg.flash_optimum:
            model = BetterTransformer.reverse(model)
        model.save_pretrained(cfg.output_dir)
--- a/scripts/runpod-entrypoint.sh
+++ b/scripts/runpod-entrypoint.sh
@@ -1,21 +1,10 @@
 #!/bin/bash

-# Export specific ENV variables to /etc/rp_environment
-echo "Exporting environment variables..."
-printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
-echo 'source /etc/rp_environment' >> ~/.bashrc
+echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
+chmod 700 -R ~/.ssh

-if [[ $PUBLIC_KEY ]]
-then
-    mkdir -p ~/.ssh
-    chmod 700 ~/.ssh
-    echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
-    chmod 700 -R ~/.ssh
-    # Start the SSH service in the background
-    service ssh start
-else
-    echo "No PUBLIC_KEY ENV variable provided, not starting openSSH daemon"
-fi
+# Start the SSH service in the background
+service ssh start

 # Execute the passed arguments (CMD)
 exec "$@"
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,13 +1,12 @@
 """Module containing Dataset functionality"""

 import logging
-import os
 from typing import List

 import torch
 from datasets import IterableDataset

-from .prompt_tokenizers import PromptTokenizingStrategy
+from .prompt_tokenizers import InvalidDataException, PromptTokenizingStrategy

 # We want this to be a wrapper for an existing dataset that we have loaded
 # lets use the concept of middlewares to wrap each dataset, for example
@@ -15,8 +14,6 @@ from .prompt_tokenizers import PromptTokenizingStrategy
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets

-LOG = logging.getLogger("axolotl")
-

 class TokenizedPromptDataset(IterableDataset):
    """
@@ -35,15 +32,17 @@ class TokenizedPromptDataset(IterableDataset):
        self.dataset = dataset

    def __iter__(self):
-        features = self.dataset.features.keys()
-        num_proc = os.cpu_count()
-        return iter(
-            self.dataset.map(
-                self.prompt_tokenizer.tokenize_prompt,
-                num_proc=num_proc,
-                remove_columns=features,
-            )
-        )
+        iterator = iter(self.dataset)
+        count = 0
+        # Loop through the entire dataset
+        for example in iterator:
+            try:
+                yield self.prompt_tokenizer.tokenize_prompt(example)
+                count += 1
+            except InvalidDataException:
+                pass
+        if count == 0:
+            raise RuntimeError("Expected at least one datapoint in dataset.")


 # TODO this isn't the best since it can't interleave datasets
@@ -116,7 +115,7 @@ class ConstantLengthDataset(IterableDataset):
                                "attention_mask": attention_mask,
                            }
                        else:
-                            LOG.warning(
+                            logging.warning(
                                f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                            )
                    buffer = {
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -8,7 +8,7 @@ import torch
 import transformers
 from einops import rearrange
 from flash_attn.bert_padding import pad_input, unpad_input
-from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
+from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
 from transformers.models.llama.modeling_llama import apply_rotary_pos_emb


@@ -79,7 +79,7 @@ def forward(
            dtype=torch.int32,
            device=qkv.device,
        )
-        output = flash_attn_varlen_qkvpacked_func(
+        output = flash_attn_unpadded_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
@@ -95,7 +95,7 @@ def forward(
            three=3,
            h=nheads,
        )
-        output_unpad = flash_attn_varlen_qkvpacked_func(
+        output_unpad = flash_attn_unpadded_qkvpacked_func(
            x_unpad,
            cu_q_lens,
            max_s,
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -1,33 +0,0 @@
-"""Logging configuration settings"""
-
-import os
-import sys
-from logging.config import dictConfig
-from typing import Any, Dict
-
-DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
-    "version": 1,
-    "formatters": {
-        "simple": {
-            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
-        },
-    },
-    "filters": {},
-    "handlers": {
-        "console": {
-            "class": "logging.StreamHandler",
-            "formatter": "simple",
-            "filters": [],
-            "stream": sys.stdout,
-        },
-    },
-    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
-    "loggers": {
-        "axolotl": {"handlers": ["console"], "level": "DEBUG", "propagate": False},
-    },
-}
-
-
-def configure_logging():
-    """Configure with default logging"""
-    dictConfig(DEFAULT_LOGGING_CONFIG)
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -7,7 +7,6 @@ import math
 from typing import Optional, Tuple

 import torch
-import torch.nn.functional as F
 import transformers.models.llama.modeling_llama
 from torch import nn

@@ -39,48 +38,21 @@ def xformers_forward(
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

-    if not hasattr(self, "pretraining_tp"):
-        self.pretraining_tp = 1
-
-    if self.pretraining_tp > 1:
-        key_value_slicing = (
-            self.num_key_value_heads * self.head_dim
-        ) // self.pretraining_tp
-        query_slices = self.q_proj.weight.split(
-            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
-        )
-        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-        query_states = [
-            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        query_states = torch.cat(query_states, dim=-1)
-
-        key_states = [
-            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        key_states = torch.cat(key_states, dim=-1)
-
-        value_states = [
-            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        value_states = torch.cat(value_states, dim=-1)
-
-    else:
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-    query_states = query_states.view(
-        bsz, q_len, self.num_heads, self.head_dim
-    ).transpose(1, 2)
-    key_states = key_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    value_states = value_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
@@ -101,14 +73,6 @@ def xformers_forward(

    past_key_value = (key_states, value_states) if use_cache else None

-    # repeat k/v heads if n_kv_heads < n_heads
-    key_states = transformers.models.llama.modeling_llama.repeat_kv(
-        key_states, self.num_key_value_groups
-    )
-    value_states = transformers.models.llama.modeling_llama.repeat_kv(
-        value_states, self.num_key_value_groups
-    )
-
    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
@@ -164,23 +128,10 @@ def xformers_forward(
                f" {attn_output.size()}"
            )

-        attn_output = attn_output.transpose(1, 2).contiguous()
-        # end x-formers vs. not x-formers if-else block
+        attn_output = attn_output.transpose(1, 2)

    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-    if self.pretraining_tp > 1:
-        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
-        o_proj_slices = self.o_proj.weight.split(
-            self.hidden_size // self.pretraining_tp, dim=1
-        )
-        attn_output = sum(
-            F.linear(attn_output[i], o_proj_slices[i])
-            for i in range(self.pretraining_tp)
-        )
-    else:
-        attn_output = self.o_proj(attn_output)
-
+    attn_output = self.o_proj(attn_output)
    return attn_output, attn_weights, past_key_value


@@ -233,15 +184,14 @@ def sdp_attention_forward(

    # We only apply sdp attention if we don't need to output the whole attention matrix
    if not output_attentions:
-        with torch.backends.cuda.sdp_kernel():
-            attn_output = torch.nn.functional.scaled_dot_product_attention(
-                query_states,
-                key_states,
-                value_states,
-                attn_mask=attention_mask,
-                is_causal=False,
-            )
-            attn_weights = None
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            is_causal=False,
+        )
+        attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -53,7 +53,7 @@ from transformers.utils import (
    replace_return_docstrings,
 )

-LOG = logging.getLogger("axolotl")
+logger = logging.get_logger(__name__)

 _CONFIG_FOR_DOC = "LlamaConfig"

@@ -862,7 +862,7 @@ class LlamaModel(LlamaPreTrainedModel):

        if self.gradient_checkpointing and self.training:
            if use_cache:
-                LOG.warning_once(
+                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -66,34 +66,15 @@ class SystemDataPrompter(AlpacaPrompter):
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
-        formatted_sys_prompt = f"### System:\n{system}\n\n" if system else ""
        if input:
-            res = formatted_sys_prompt + self.turn_format.format(
-                instruction=instruction, input=input
-            )
+            res = system + self.turn_format.format(instruction=instruction, input=input)
        else:
-            res = formatted_sys_prompt + self.turn_no_input_format.format(
-                instruction=instruction
-            )
+            res = system + self.turn_no_input_format.format(instruction=instruction)
        if output:
            res = f"{res}{output}"
        yield res


-class OpenOrcaSystemDataPrompter(SystemDataPrompter):
-    """
-    Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
-    """
-
-    def match_prompt_style(self):
-        if self.prompt_style == PromptStyle.INSTRUCT.value:
-            self.turn_format = "### User:\n{instruction}\n\n### Additional Context:\n{input}\n\n### Assistant:\n"
-            self.turn_no_input_format = "### User:\n{instruction}\n\n### Assistant:\n"
-        if self.prompt_style == PromptStyle.CHAT.value:
-            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
-            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
-
-
 class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca datasets
@@ -132,7 +113,7 @@ def load_chat(tokenizer, cfg):

 def load_open_orca(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
-        OpenOrcaSystemDataPrompter(PromptStyle.INSTRUCT.value),
+        SystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
--- a/src/axolotl/prompt_strategies/pygmalion.py
+++ b/src/axolotl/prompt_strategies/pygmalion.py
@@ -11,8 +11,6 @@ from axolotl.prompt_tokenizers import (
    tokenize_prompt_default,
 )

-LOG = logging.getLogger("axolotl")
-
 IGNORE_TOKEN_ID = -100


@@ -66,7 +64,7 @@ class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
                    *copy.deepcopy(res["input_ids"])
                ][len(self.bot_prefix_token_ids) :]
            else:
-                LOG.warning(f"unknown role in conversation: {role}")
+                logging.warning(f"unknown role in conversation: {role}")
                res = defaultdict(lambda: [])

            # pylint: disable=duplicate-code
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -10,8 +10,6 @@ from transformers import PreTrainedTokenizer

 from axolotl.prompters import IGNORE_TOKEN_ID

-LOG = logging.getLogger("axolotl")
-
 IGNORE_INDEX = -100
 LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
 LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
@@ -48,22 +46,16 @@ class PromptTokenizingStrategy(abc.ABC):

    @functools.lru_cache(maxsize=128)
    def _get_user_token(self):
-        try:
-            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
-            if isinstance(id_or_ids, (int,)):
-                return id_or_ids
-        except KeyError:
-            pass
+        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
+        if isinstance(id_or_ids, (int,)):
+            return id_or_ids
        return False

    @functools.lru_cache(maxsize=128)
    def _get_assistant_token(self):
-        try:
-            id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
-            if isinstance(id_or_ids, (int,)):
-                return id_or_ids
-        except KeyError:
-            pass
+        id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
+        if isinstance(id_or_ids, (int,)):
+            return id_or_ids
        return False

    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
@@ -392,7 +384,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                        # everything from this is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                    else:
-                        LOG.warning(f"unhandled role: {part[0]}")
+                        logging.warning(f"unhandled role: {part[0]}")

                # pylint: disable=duplicate-code
                result, current_len = parse_tokenized_to_result(
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -5,7 +5,6 @@ import logging
 from enum import Enum, auto
 from typing import Generator, List, Optional, Tuple, Union

-LOG = logging.getLogger("axolotl")
 IGNORE_TOKEN_ID = -100


@@ -242,7 +241,7 @@ class Conversation:
            if message:
                yield (role + ":", " " + message)
            else:
-                LOG.warning(f"role with empty message: {role}")
+                logging.warning(f"role with empty message: {role}")
                yield (role + ":", "")

    def copy(self):
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -1,6 +1,5 @@
 """Module containing data utilities"""
 import functools
-import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -36,11 +35,9 @@ from axolotl.prompters import (
    SummarizeTLDRPrompter,
 )

-LOG = logging.getLogger("axolotl")
-

 def load_tokenized_prepared_datasets(
-    tokenizer, cfg, default_dataset_prepared_path
+    split, tokenizer, cfg, default_dataset_prepared_path
 ) -> DatasetDict:
    tokenizer_name = tokenizer.__class__.__name__
    ds_hash = str(
@@ -52,6 +49,8 @@ def load_tokenized_prepared_datasets(
                    sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
                )
                + "|"
+                + split
+                + "|"
                + tokenizer_name
            ).encode("utf-8")
        ).hexdigest()
@@ -69,24 +68,24 @@ def load_tokenized_prepared_datasets(
                f"{cfg.push_dataset_to_hub}/{ds_hash}",
                use_auth_token=use_auth_token,
            )
-            dataset = dataset["train"]
+            dataset = dataset[split]
    except Exception:  # pylint: disable=broad-except # nosec
        pass

    if dataset:
        ...
    elif any(prepared_ds_path.glob("*")):
-        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
+        logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
        dataset = load_from_disk(str(prepared_ds_path))
-        LOG.info("Prepared dataset loaded from disk...")
+        logging.info("Prepared dataset loaded from disk...")
    else:
-        LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
-        LOG.info("Loading raw datasets...")
+        logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
+        logging.info("Loading raw datasets...")

        if cfg.seed:
            seed = cfg.seed
        else:
-            LOG.info("No seed provided, using default seed of 42")
+            logging.info("No seed provided, using default seed of 42")
            seed = 42

        datasets = []
@@ -97,7 +96,6 @@ def load_tokenized_prepared_datasets(
            try:
                load_dataset(
                    d.path,
-                    name=d.name,
                    streaming=True,
                    use_auth_token=use_auth_token,
                )
@@ -106,51 +104,40 @@ def load_tokenized_prepared_datasets(
                pass

            # prefer local dataset, even if hub exists
-            local_path = Path(d.path)
-            if local_path.exists():
-                if local_path.is_dir():
+            if Path(d.path).exists():
+                ds = load_dataset(
+                    "json",
+                    data_files=d.path,
+                    streaming=False,
+                    split=None,
+                )
+            elif ds_from_hub:
+                if d.data_files:
                    ds = load_dataset(
                        d.path,
-                        name=d.name,
+                        streaming=False,
                        data_files=d.data_files,
-                        streaming=False,
-                        split=None,
-                    )
-                elif local_path.is_file():
-                    ds = load_dataset(
-                        "json",
-                        name=d.name,
-                        data_files=d.path,
-                        streaming=False,
-                        split=None,
+                        use_auth_token=use_auth_token,
                    )
                else:
-                    raise ValueError(
-                        "unhandled dataset load: local path exists, but is neither a directory or a file"
+                    ds = load_dataset(
+                        d.path,
+                        streaming=False,
+                        use_auth_token=use_auth_token,
                    )
-            elif ds_from_hub:
-                ds = load_dataset(
-                    d.path,
-                    name=d.name,
-                    streaming=False,
-                    data_files=d.data_files,
-                    use_auth_token=use_auth_token,
-                )
            else:
                fp = hf_hub_download(
                    repo_id=d.path,
                    repo_type="dataset",
                    filename=d.data_files,
                )
-                ds = load_dataset(
-                    "json", name=d.name, data_files=fp, streaming=False, split=None
-                )
+                ds = load_dataset("json", data_files=fp, streaming=False, split=None)
            if not ds:
                raise ValueError("unhandled dataset load")
            # support for using a subset of the data
            if d.shards:
-                if "train" in ds:
-                    ds = ds.shuffle(seed=seed)["train"].shard(
+                if split in ds:
+                    ds = ds.shuffle(seed=seed)[split].shard(
                        num_shards=d.shards, index=0
                    )
                else:
@@ -159,8 +146,8 @@ def load_tokenized_prepared_datasets(
            d_type_split = d_type.split(":")
            d_base_type = d_type_split[0]
            d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
-            if "train" in ds:
-                ds = ds["train"]
+            if split in ds:
+                ds = ds[split]
            if ds_strategy := load(d.type, tokenizer, cfg):
                ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                datasets.append(ds_wrapper)
@@ -258,29 +245,25 @@ def load_tokenized_prepared_datasets(
                suffix = ""
                if ":load_" in d.type:
                    suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
-                LOG.error(f"unhandled prompt tokenization strategy: {d.type}. {suffix}")
+                logging.error(
+                    f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
+                )
                raise ValueError(
                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
                )
-        LOG.info("tokenizing, merging, and shuffling master dataset")
+        logging.info("tokenizing, merging, and shuffling master dataset")

        samples: List[int] = []
-        chunk_size = 1000
        for d in datasets:
-            d_iter = iter(d)
-            while True:
-                chunk = list(itertools.islice(d_iter, chunk_size))
-                if not chunk:
-                    break
-                samples.extend(chunk)
-
-        LOG.info("shuffle")
+            samples = samples + list(d)
        dataset = Dataset.from_list(samples).shuffle(seed=seed)
        if cfg.local_rank == 0:
-            LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
+            logging.info(
+                f"Saving merged prepared dataset to disk... {prepared_ds_path}"
+            )
            dataset.save_to_disk(prepared_ds_path)
            if cfg.push_dataset_to_hub:
-                LOG.info(
+                logging.info(
                    f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
@@ -331,53 +314,63 @@ def load_prepare_datasets(
        use_auth_token = cfg.hf_use_auth_token
        try:
            if cfg.push_dataset_to_hub:
-                LOG.info(
+                logging.info(
                    f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset = load_dataset(
                    f"{cfg.push_dataset_to_hub}/{ds_hash}",
                    use_auth_token=use_auth_token,
                )
-                dataset = dataset["train"]
        except Exception:  # pylint: disable=broad-except # nosec
            pass

        if dataset:
            ...
        elif any(prepared_ds_path.glob("*")):
-            LOG.info(
+            logging.info(
                f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
            )
            dataset = load_from_disk(str(prepared_ds_path))
-            LOG.info("Prepared packed dataset loaded from disk...")
+            logging.info("Prepared packed dataset loaded from disk...")
            if cfg.push_dataset_to_hub:
-                LOG.info(
+                logging.info(
                    f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset.push_to_hub(
                    f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
                )
        else:
-            dataset = load_tokenized_prepared_datasets(
-                tokenizer, cfg, default_dataset_prepared_path
+            dataset_train = load_tokenized_prepared_datasets(
+                "train", tokenizer, cfg, default_dataset_prepared_path
            )
-
+            dataset_test = load_tokenized_prepared_datasets(
+                "test", tokenizer, cfg, default_dataset_prepared_path
+            )
+            dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
            if cfg.seed:
                dataset = dataset.shuffle(seed=cfg.seed)

-            constant_len_dataset = ConstantLengthDataset(
+            constant_len_dataset_train = ConstantLengthDataset(
                tokenizer,
-                [dataset],
+                [dataset["train"]],
                seq_length=max_packed_sequence_len,
            )
-            LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
-            dataset = Dataset.from_list(list(constant_len_dataset))
+            constant_len_dataset_test = ConstantLengthDataset(
+                tokenizer,
+                [dataset["test"]],
+                seq_length=max_packed_sequence_len,
+            )
+            logging.info(
+                f"packing master dataset to len: {cfg.max_packed_sequence_len}"
+            )
+            dataset_train = Dataset.from_list(list(constant_len_dataset_train))
+            dataset_test = Dataset.from_list(list(constant_len_dataset_test))

            # filter out bad data
-            dataset = Dataset.from_list(
+            dataset_train = Dataset.from_list(
                [
                    d
-                    for d in dataset
+                    for d in dataset_train
                    if len(d["input_ids"]) < cfg.sequence_len
                    and len(d["input_ids"]) > 0
                    and len(d["input_ids"]) == len(d["attention_mask"])
@@ -385,13 +378,26 @@ def load_prepare_datasets(
                ]
            )

+            # filter out bad data
+            dataset_test = Dataset.from_list(
+                [
+                    d
+                    for d in dataset_test
+                    if len(d["input_ids"]) < cfg.sequence_len
+                    and len(d["input_ids"]) > 0
+                    and len(d["input_ids"]) == len(d["attention_mask"])
+                    and len(d["input_ids"]) == len(d["labels"])
+                ]
+            )
+            dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
+
            if cfg.local_rank == 0:
-                LOG.info(
+                logging.info(
                    f"Saving packed prepared dataset to disk... {prepared_ds_path}"
                )
                dataset.save_to_disk(prepared_ds_path)
                if cfg.push_dataset_to_hub:
-                    LOG.info(
+                    logging.info(
                        f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                    )
                    dataset.push_to_hub(
@@ -399,12 +405,17 @@ def load_prepare_datasets(
                        private=True,
                    )
    else:
+        # dataset_train = load_tokenized_prepared_datasets(
        dataset = load_tokenized_prepared_datasets(
-            tokenizer, cfg, default_dataset_prepared_path
+            "train", tokenizer, cfg, default_dataset_prepared_path
        )
+        # dataset_test = load_tokenized_prepared_datasets(
+        #     "test", tokenizer, cfg, default_dataset_prepared_path
+        # )
+        # dataset = DatasetDict({"train": dataset_train, "test": dataset_test})

    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
-        LOG.info(
+        logging.info(
            f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
        )
        dataset = dataset.shard(
@@ -416,6 +427,9 @@ def load_prepare_datasets(
        dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
        train_dataset = dataset["train"]
        eval_dataset = dataset["test"]
+    elif "train" in dataset:
+        train_dataset = dataset["train"]
+        eval_dataset = dataset["test"]
    else:
        train_dataset = dataset
        eval_dataset = None
@@ -525,7 +539,7 @@ def encode_pretraining(tokenizer, max_tokens, examples):
        "attention_mask": [seq.tolist() for seq in new_attention_mask],
    }

-    LOG.debug(len(ret["input_ids"]))
+    logging.debug(len(ret["input_ids"]))
    return ret


--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -23,8 +23,6 @@ from transformers import (  # noqa: F401

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN

-LOG = logging.getLogger("axolotl")
-
 if TYPE_CHECKING:
    from peft import PeftConfig  # noqa: F401

@@ -52,10 +50,10 @@ def load_tokenizer(
            use_fast=use_fast,
        )

-    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
-    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
-    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
-    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
+    logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
+    logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
+    logging.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
+    logging.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if tokenizer.__class__.__name__ in [
        "LlamaTokenizer",
@@ -92,25 +90,23 @@ def load_model(

    if cfg.is_llama_derived_model and cfg.flash_attention:
        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
-            from axolotl.monkeypatch.llama_attn_hijack_flash import (
-                replace_llama_attn_with_flash_attn,
-            )
+            from axolotl.flash_attn import replace_llama_attn_with_flash_attn

-            LOG.info("patching with flash attention")
+            logging.info("patching with flash attention")
            replace_llama_attn_with_flash_attn()
    elif cfg.is_llama_derived_model and cfg.xformers_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
        )

-        LOG.info("patching with xformers attention")
+        logging.info("patching with xformers attention")
        hijack_llama_attention()
    elif cfg.is_llama_derived_model and cfg.sdp_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_sdp_attention,
        )

-        LOG.info("patching with sdp attention")
+        logging.info("patching with sdp attention")
        hijack_llama_sdp_attention()
    elif cfg.is_llama_derived_model and cfg.landmark_attention:
        from axolotl.monkeypatch.llama_landmark_attn import (
@@ -118,7 +114,7 @@ def load_model(
            patch_llama_with_landmark_attn,
        )

-        LOG.info("patching with landmark attention")
+        logging.info("patching with landmark attention")
        patch_llama_with_landmark_attn()

        # Note: This might overwrite previous additional_special_tokens
@@ -129,7 +125,7 @@ def load_model(
            replace_llama_rope_with_xpos_rope,
        )

-        LOG.info("patching with xpos rope")
+        logging.info("patching with xpos rope")
        replace_llama_rope_with_xpos_rope()

    if cfg.bf16 or cfg.bfloat16:
@@ -146,24 +142,18 @@ def load_model(

            replace_peft_model_with_int4_lora_model()
    except Exception as err:
-        LOG.exception(err)
+        logging.exception(err)
        raise err

-    if not cfg.gptq and (
-        (cfg.adapter == "lora" and load_in_8bit)
-        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
-    ):
-        try:
-            from peft import prepare_model_for_kbit_training
-        except ImportError:
-            # For backward compatibility
-            from peft import (
-                prepare_model_for_int8_training as prepare_model_for_kbit_training,
-            )
+    try:
+        from peft import prepare_model_for_kbit_training
+    except ImportError:
+        # For backward compatibility
+        from peft import (
+            prepare_model_for_int8_training as prepare_model_for_kbit_training,
+        )

    model_kwargs = {}
-    if cfg.model_revision:
-        model_kwargs["revision"] = cfg.model_revision
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
@@ -195,7 +185,7 @@ def load_model(
                if len(files) > 0:
                    model_path = str(files[0])
                else:
-                    LOG.warning(
+                    logging.warning(
                        "unable to find a cached model file, this will likely fail..."
                    )
                    model_path = str(cache_model_path)
@@ -212,7 +202,7 @@ def load_model(
                else True,
            )
            load_in_8bit = False
-        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
+        elif cfg.is_llama_derived_model:
            from transformers import LlamaForCausalLM

            config = LlamaConfig.from_pretrained(base_model_config)
@@ -251,7 +241,7 @@ def load_model(
        #         device=cfg.device,
        #     )
        #     model.train() # sets to train instead of eval mode
-        elif model_type and not cfg.trust_remote_code:
+        elif model_type:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -274,14 +264,14 @@ def load_model(
                and cfg.sequence_len > config.max_seq_len
            ):
                config.max_seq_len = cfg.sequence_len
-                LOG.warning(f"increasing context length to {cfg.sequence_len}")
+                logging.warning(f"increasing context length to {cfg.sequence_len}")
            elif (
                hasattr(config, "max_sequence_length")
                and config.max_sequence_length
                and cfg.sequence_len > config.max_sequence_length
            ):
                config.max_sequence_length = cfg.sequence_len
-                LOG.warning(f"increasing context length to {cfg.sequence_len}")
+                logging.warning(f"increasing context length to {cfg.sequence_len}")
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                config=config,
@@ -293,10 +283,10 @@ def load_model(
                **model_kwargs,
            )
    except Exception as err:  # pylint: disable=broad-exception-caught
-        LOG.error(
+        logging.error(
            "Exception raised attempting to load model, retrying with AutoModelForCausalLM"
        )
-        LOG.exception(err)
+        logging.exception(err)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
@@ -307,11 +297,7 @@ def load_model(
            **model_kwargs,
        )

-    embeddings_len = (
-        math.ceil(len(tokenizer) / 32) * 32
-        if cfg.resize_token_embeddings_to_32x
-        else len(tokenizer)
-    )
+    embeddings_len = math.ceil(len(tokenizer) / 32) * 32
    model.resize_token_embeddings(embeddings_len)

    if (
@@ -319,7 +305,7 @@ def load_model(
        and model.config.max_position_embeddings
        and cfg.sequence_len >= model.config.max_position_embeddings
    ):
-        LOG.warning(
+        logging.warning(
            f"increasing model.config.max_position_embeddings to {cfg.sequence_len}"
        )
        model.config.max_position_embeddings = cfg.sequence_len
@@ -328,21 +314,11 @@ def load_model(
        (cfg.adapter == "lora" and load_in_8bit)
        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
-        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
+        logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )

-        # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
-        # convert them back to fp16/bf16 for flash-attn compatibility.
-        if cfg.flash_attention and cfg.is_llama_derived_model:
-            for name, module in model.named_modules():
-                if "norm" in name:
-                    module.to(torch_dtype)
-                if "lm_head" in name or "embed_tokens" in name:
-                    if hasattr(module, "weight"):
-                        module.to(torch_dtype)
-
    model, lora_config = load_adapter(model, cfg, adapter)

    if cfg.ddp and not load_in_8bit:
@@ -350,7 +326,7 @@ def load_model(

    if cfg.gptq:
        # Scales to half
-        LOG.info("Fitting 4bit scales and zeros to half")
+        logging.info("Fitting 4bit scales and zeros to half")
        for _, module in model.named_modules():
            if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
                type(module)
@@ -376,7 +352,7 @@ def load_model(
        if param.requires_grad:
            requires_grad.append(f"{name}: {param.requires_grad}")
    if len(requires_grad) == 0:
-        LOG.warning("there are no parameters that require gradient updates")
+        logging.warning("there are no parameters that require gradient updates")
    model.config.use_cache = False

    if cfg.flash_optimum:
@@ -410,7 +386,7 @@ def load_llama_adapter(model, cfg):
    )

    if cfg.lora_model_dir:
-        LOG.info("Loading pretained LORA")
+        logging.info("Loading pretained LORA")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
@@ -457,7 +433,7 @@ def load_lora(model, cfg):
            bits = 8

        linear_names = find_all_linear_names(bits, model)
-        LOG.info(f"found linear modules: {repr(linear_names)}")
+        logging.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

    lora_config = LoraConfig(
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -1,9 +1,6 @@
 """Module for custom LRScheduler class"""
-import math
-from functools import partial

-from torch.optim import Optimizer
-from torch.optim.lr_scheduler import LambdaLR, LRScheduler
+from torch.optim.lr_scheduler import LRScheduler


 class InterpolatingLogScheduler(LRScheduler):
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs
-
-
-def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
-    current_step: int,
-    *,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float
-):
-    if current_step < num_warmup_steps:
-        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
-    progress = float(current_step - num_warmup_steps) / float(
-        max(1, num_training_steps - num_warmup_steps)
-    )
-    return max(
-        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
-    )
-
-
-def get_cosine_schedule_with_quadratic_warmup(
-    optimizer: Optimizer,
-    num_warmup_steps: int,
-    num_training_steps: int,
-    num_cycles: float = 0.5,
-    last_epoch: int = -1,
-):
-    """
-    Create a schedule with a learning rate that decreases following the values of the cosine function between the
-    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
-    initial lr set in the optimizer.
-
-    Args:
-        optimizer ([`~torch.optim.Optimizer`]):
-            The optimizer for which to schedule the learning rate.
-        num_warmup_steps (`int`):
-            The number of steps for the warmup phase.
-        num_training_steps (`int`):
-            The total number of training steps.
-        num_cycles (`float`, *optional*, defaults to 0.5):
-            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
-            following a half-cosine).
-        last_epoch (`int`, *optional*, defaults to -1):
-            The index of the last epoch when resuming training.
-
-    Return:
-        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
-    """
-
-    lr_lambda = partial(
-        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
-        num_warmup_steps=num_warmup_steps,
-        num_training_steps=num_training_steps,
-        num_cycles=num_cycles,
-    )
-    return LambdaLR(optimizer, lr_lambda, last_epoch)
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -5,8 +5,6 @@ import logging

 from termcolor import colored

-LOG = logging.getLogger("axolotl")
-

 def check_dataset_labels(dataset, tokenizer):
    # the dataset is already shuffled, so let's just check the first 5 elements
@@ -34,7 +32,7 @@ def check_example_labels(example, tokenizer):
        )
        colored_tokens.append(colored_token)

-    LOG.info(" ".join(colored_tokens))
-    LOG.info("\n\n\n")
+    logging.info(" ".join(colored_tokens))
+    logging.info("\n\n\n")

    return " ".join(colored_tokens)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -5,7 +5,6 @@ import logging
 import math
 import os
 import sys
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Optional

@@ -14,70 +13,17 @@ import torch.cuda
 import transformers
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
-from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
+from transformers import EarlyStoppingCallback, Trainer
 from transformers.trainer_pt_utils import get_parameter_names

 from axolotl.utils.callbacks import (
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
 )
-from axolotl.utils.schedulers import (
-    InterpolatingLogScheduler,
-    get_cosine_schedule_with_quadratic_warmup,
-)
-
-LOG = logging.getLogger("axolotl")
+from axolotl.utils.schedulers import InterpolatingLogScheduler


-@dataclass
-class AxolotlTrainingArguments(TrainingArguments):
-    """
-    Extend the base TrainingArguments for axolotl helpers
-    """
-
-    lr_quadratic_warmup: bool = field(
-        default=False,
-        metadata={"help": "Use quadratic warmup for cosine scheduling."},
-    )
-
-
-class AxolotlTrainer(Trainer):
-    """
-    Extend the base Trainer for axolotl helpers
-    """
-
-    args = None  # type: AxolotlTrainingArguments
-
-    def create_scheduler(
-        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
-    ):
-        """
-        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
-        passed as an argument.
-
-        Args:
-            num_training_steps (int): The number of training steps to do.
-            optimizer (torch.optim.Optimizer): The training optimizer
-        """
-
-        # fmt: off
-        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
-            # fmt: on
-            if (
-                self.args.lr_scheduler_type == "cosine"
-                and self.args.lr_quadratic_warmup is True
-            ):
-                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
-                    optimizer,
-                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
-                    num_training_steps=num_training_steps,
-                )
-            else:
-                return super().create_scheduler(num_training_steps, optimizer)
-        return self.lr_scheduler
-
-
-class OneCycleLRSchedulerTrainer(AxolotlTrainer):
+class OneCycleLRSchedulerTrainer(Trainer):
    """
    Trainer subclass that uses the OneCycleLR scheduler
    """
@@ -157,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        if cfg.fsdp_config:
            training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)

-    if cfg.lr_quadratic_warmup is not None:
-        training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
-
    # deepspeed
    if (
        os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -181,15 +124,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.max_grad_norm:
        training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm

-    if cfg.hub_model_id:
-        training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
+    if cfg.push_to_hub_model_id:
+        training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id
        training_arguments_kwargs["push_to_hub"] = True
-        training_arguments_kwargs["hub_private_repo"] = True

-    if cfg.save_safetensors:
-        training_arguments_kwargs["save_safetensors"] = cfg.save_safetensors
-
-    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
+    training_args = transformers.TrainingArguments(
        per_device_train_batch_size=cfg.micro_batch_size,
        per_device_eval_batch_size=cfg.eval_batch_size
        if cfg.eval_batch_size is not None
@@ -198,9 +137,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        eval_accumulation_steps=cfg.gradient_accumulation_steps,
        num_train_epochs=cfg.num_epochs,
        learning_rate=cfg.learning_rate,
-        evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
+        evaluation_strategy="steps",
        save_strategy="steps" if cfg.save_steps else "epoch",
-        eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
+        eval_steps=cfg.eval_steps,
        save_steps=cfg.save_steps,
        output_dir=cfg.output_dir,
        save_total_limit=3,
@@ -327,7 +266,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):

        set_model_mem_id(model, tokenizer)

-        LOG.info("Adding landmark attention tokens to dataset")
+        logging.info("Adding landmark attention tokens to dataset")

        for dataset in [train_dataset, eval_dataset]:
            dataset = dataset.map(
@@ -339,7 +278,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    trainer_cls = (
        OneCycleLRSchedulerTrainer
        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
-        else AxolotlTrainer
+        else transformers.Trainer
    )
    trainer = trainer_cls(
        model=model,
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -4,8 +4,6 @@ import logging

 import torch

-LOG = logging.getLogger("axolotl")
-

 def validate_config(cfg):
    if cfg.gradient_accumulation_steps and cfg.batch_size:
@@ -13,7 +11,7 @@ def validate_config(cfg):
            "please set only one of gradient_accumulation_steps or batch_size"
        )
    if cfg.batch_size:
-        LOG.warning(
+        logging.warning(
            "%s\n%s",
            "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
@@ -46,10 +44,10 @@ def validate_config(cfg):
                raise ValueError("Require cfg.load_in_4bit to be True for qlora")

    if not cfg.load_in_8bit and cfg.adapter == "lora":
-        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
+        logging.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")

    if cfg.trust_remote_code:
-        LOG.warning(
+        logging.warning(
            "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
        )

@@ -68,34 +66,31 @@ def validate_config(cfg):

    if cfg.flash_optimum is True:
        if cfg.adapter:
-            LOG.warning("BetterTransformers probably doesn't work with PEFT adapters")
+            logging.warning(
+                "BetterTransformers probably doesn't work with PEFT adapters"
+            )
        if cfg.fp16 or cfg.bf16:
            raise ValueError("AMP is not supported with BetterTransformer")
        if cfg.float16 is not True and cfg.bloat16 is not True:
-            LOG.warning(
+            logging.warning(
                "You should probably set bfloat16 or float16 to true to "
                "load the model in float16 for BetterTransformers"
            )
        if int(torch.__version__.split(".")[0]) < 2:
-            LOG.warning("torch>=2.0.0 required")
+            logging.warning("torch>=2.0.0 required")
            raise ValueError(
                f"flash_optimum for BetterTransformers may not be used with {torch.__version__}"
            )

    if cfg.pretraining_dataset and cfg.group_by_length:
-        LOG.warning(
+        logging.warning(
            "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
        )

-    if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
+    if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
        not cfg.optimizer or "adamw" not in cfg.optimizer
    ):
-        LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
-
-    if cfg.push_to_hub_model_id:
-        raise ValueError(
-            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
-        )
+        logging.warning("adamw hyperparameters found, but no adamw optimizer set")

    # TODO
    # MPT 7b
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -17,7 +17,7 @@ from axolotl.prompt_tokenizers import (
 )
 from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter

-LOG = logging.getLogger("axolotl")
+logging.basicConfig(level="INFO")


 class TestPromptTokenizationStrategies(unittest.TestCase):
@@ -130,9 +130,8 @@ class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
            "output": "Hi! How can I help?",
        }
        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:4] == [1, 835, 2184, 29901]  # "<s>### System:"
-        assert example["input_ids"][5:7] == [1509, 20118]  # "use cot"
-        assert example["input_ids"][9] == 11889  # USER
+        assert example["input_ids"][0:3] == [1, 671, 20118]  # <s>use cot
+        assert example["input_ids"][3] == 11889  # USER


 if __name__ == "__main__":
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -70,7 +70,7 @@ class AlpacaPrompterTest(unittest.TestCase):
            )
        )
        assert "use cot" in res
-        assert res.startswith("### System:")
+        assert res.startswith("use cot")
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": None,
-                "adam_epsilon": 0.0001,
+                "adamw_epsilon": 0.0001,
            }
        )

@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adafactor",
-                "adam_beta1": 0.0001,
+                "adamw_beta1": 0.0001,
            }
        )

@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
        cfg = DictDefault(
            {
                "optimizer": "adamw_bnb_8bit",
-                "adam_beta1": 0.9,
-                "adam_beta2": 0.99,
-                "adam_epsilon": 0.0001,
+                "adamw_beta1": 0.0001,
+                "adamw_beta2": 0.0001,
+                "adamw_epsilon": 0.0001,
            }
        )
Author	SHA1	Message	Date
Wing Lian	f6721baf10	tweak to make it work when we have no explicit test split [Some checks failed: pre-commit / pre-commit (push) cancelled; PyTest / test (3.10) (push) cancelled; PyTest / test (3.9) (push) cancelled]	2023-07-11 22:40:21 -04:00
Wing Lian	33814cc94e	make sure we eval for openorca	2023-07-02 17:59:10 -04:00
Wing Lian	50254a7ccc	handle orca splits	2023-07-01 07:20:23 -04:00