wip, jagged restarts

Add MPS support (#1264 )
* add mps support * linter stuff * CI fixes * install packaging for various tests * Update setup.py * Revert "install packaging for various tests" This reverts commit 980e7aa44d. * Revert "CI fixes" This reverts commit 4609e3b166. --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-02-16 14:34:08 -05:00 · 2024-02-12 08:30:32 -05:00 · 2024-02-09 14:54:31 -05:00 · 2024-02-09 07:38:08 -08:00 · 2024-02-09 10:32:54 -05:00 · 2024-02-08 20:02:17 -08:00
132 changed files with 5141 additions and 3753 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,6 +1,6 @@
 # These are supported funding model platforms

-github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 patreon: # Replace with a single Patreon username
 open_collective: # Replace with a single Open Collective username
 ko_fi: axolotl_ai # Replace with a single Ko-fi username
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -1,25 +1,17 @@
 name: ci-cd-base

 on:
-  push:
-    branches:
-      - "main-base"
-      - "dev-base"
+  workflow_dispatch:

 jobs:
  build-base:
    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: self-hosted
+    runs-on: axolotl-gpu-runner
    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: "118"
-            cuda_version: 11.8.0
-            python_version: "3.9"
-            pytorch: 2.0.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
@@ -28,12 +20,17 @@ jobs:
          - cuda: "118"
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
          - cuda: "121"
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+          - cuda: "121"
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
    steps:
      - name: Checkout
@@ -56,7 +53,7 @@ jobs:
          context: .
          file: ./docker/Dockerfile-base
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,37 +4,37 @@ on:
  push:
    branches:
      - "main"
+  workflow_dispatch:

 jobs:
  build-axolotl:
-    if: github.repository_owner == 'OpenAccess-AI-Collective'
-    # this job needs to be run on self-hosted GPU runners...
+    # Skip the docker build when the head commit message contains "[skip docker]"; only run in the upstream org.
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 118
            cuda_version: 11.8.0
-            python_version: "3.9"
+            python_version: "3.10"
            pytorch: 2.0.1
            axolotl_extras:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.1.2
            axolotl_extras:
            is_latest: true
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.1
-            axolotl_extras:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
            axolotl_extras:
-    runs-on: [self-hosted, gpu, docker]
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
+            axolotl_extras:
+    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -55,57 +55,46 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          context: .
-          load: true
          build-args: |
            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
          file: ./docker/Dockerfile
+          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
-      - name: Unit Tests
-        run: |
-          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
-      - name: Push to Docker Hub
-        if: github.event_name != 'pull_request'
-        run: |
-          docker push ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          latest_tag=${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          if [ -n "$latest_tag" ]; then
-            docker push "$latest_tag"
-          fi

  build-axolotl-runpod:
    needs: build-axolotl
-    if: github.repository_owner == 'OpenAccess-AI-Collective'
+    # Skip the runpod image build when the head commit message contains "[skip docker]"; only run in the upstream org.
+    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
        include:
          - cuda: 118
            cuda_version: 11.8.0
-            python_version: "3.9"
+            python_version: "3.10"
            pytorch: 2.0.1
            axolotl_extras:
          - cuda: 118
            cuda_version: 11.8.0
            python_version: "3.10"
-            pytorch: 2.0.1
+            pytorch: 2.1.2
            axolotl_extras:
            is_latest: true
-          - cuda: 118
-            cuda_version: 11.8.0
-            python_version: "3.10"
-            pytorch: 2.1.1
-            axolotl_extras:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
            axolotl_extras:
-    runs-on: [self-hosted, gpu, docker]
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.11"
+            pytorch: 2.1.2
+            axolotl_extras:
+    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -73,7 +73,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.1
+            pytorch: 2.1.2
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -106,3 +106,7 @@ jobs:
      - name: GPU Unit Tests monkeypatched w docker image
        run: |
          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
+      - name: Prune image from docker
+        if: github.ref != 'refs/heads/main'
+        run: |
+          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -32,6 +32,9 @@ ignore_missing_imports = True
 [mypy-bitsandbytes]
 ignore_missing_imports = True

+[mypy-requests]
+ignore_missing_imports = True
+
 [mypy-datasets]
 ignore_missing_imports = True

--- a/README.md
+++ b/README.md
@@ -25,8 +25,8 @@ Features:
 - [Installation](#installation)
  - [Docker](#docker)
  - [Conda/Pip venv](#condapip-venv)
-  - [Cloud GPU](#cloud-gpu) - Runpod, Latitude
-  - [LambdaLabs](#lambdalabs)
+  - [Cloud GPU](#cloud-gpu) - Latitude.sh, RunPod
+  - [Bare Metal Cloud GPU](#bare-metal-cloud-gpu)
  - [Windows](#windows)
  - [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
 - [Dataset](#dataset)
@@ -37,6 +37,9 @@ Features:
  - [Inference](#inference)
  - [Merge LORA to Base](#merge-lora-to-base)
  - [Special Tokens](#special-tokens)
+- Advanced Topics
+  - [Multipack](./docs/multipack.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
+  - [RLHF & DPO](./docs/rlhf.md)<svg width="24" height="24" viewBox="0 0 24 24" xmlns="http://www.w3.org/2000/svg"><path d="M17 13.5v6H5v-12h6m3-3h6v6m0-6-9 9" class="icon_svg-stroke" stroke="#666" stroke-width="1.5" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg>
 - [Common Errors](#common-errors-)
  - [Tokenization Mismatch b/w Training & Inference](#tokenization-mismatch-bw-inference--training)
 - [Debugging Axolotl](#debugging-axolotl)
@@ -105,6 +108,9 @@ pip3 install -e '.[flash-attn,deepspeed]'

 ### Usage
 ```bash
+# preprocess datasets - optional but recommended
+CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/openllama-3b/lora.yml
+
 # finetune lora
 accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml

@@ -115,6 +121,10 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 # gradio
 accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
    --lora_model_dir="./lora-out" --gradio
+
+# remote yaml files - the yaml config can be hosted on a public URL
+# Note: the yaml config must directly link to the **raw** yaml
+accelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/examples/openllama-3b/lora.yml
 ```

 ## Installation
@@ -176,9 +186,13 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --

 For cloud GPU providers that support docker images, use [`winglian/axolotl-cloud:main-latest`](https://hub.docker.com/r/winglian/axolotl-cloud/tags)

+- on Latitude.sh use this [direct link](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)
 - on RunPod use this [direct link](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)

-#### LambdaLabs
+#### Bare Metal Cloud GPU
+
+##### LambdaLabs
+
  <details>

  <summary>Click to Expand</summary>
@@ -458,14 +472,20 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
  dataset:
    - path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
      ...
+
+  # Loading Data From a Public URL
+  # - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
+  dataset:
+    - path: https://some.url.com/yourdata.jsonl # The URL should be a direct link to the file you wish to load. URLs must use HTTPS protocol, not HTTP.
+      ds_type: json # this is the default, see other options below.
  ```

 - loading
  ```yaml
  load_in_4bit: true
  load_in_8bit: true
-  bf16: true # require >=ampere
-  fp16: true
+  bf16: auto # require >=ampere, auto will detect if your GPU supports this and choose automatically.
+  fp16: # leave empty to use fp16 when bf16 is 'auto'. set to false if you want to fallback to fp32
  tf32: true # require >=ampere
  bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
  float16: true # use instead of fp16 when you don't want AMP
@@ -604,12 +624,25 @@ datasets:
      # For `completion` datsets only, uses the provided field instead of `text` column
      field:

+# A list of one or more datasets to eval the model with.
+# You can use either test_datasets, or val_set_size, but not both.
+test_datasets:
+  - path: /workspace/data/eval.jsonl
+    ds_type: json
+    # You need to specify a split. For "json" datasets the default split is called "train".
+    split: train
+    type: completion
+    data_files:
+      - /workspace/data/eval.jsonl
+
 # use RL training: dpo, ipo, kto_pair
 rl:

 # Saves the desired chat template to the tokenizer_config.json for easier inferencing
 # Currently supports chatml and inst (mistral/mixtral)
 chat_template: chatml
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
@@ -618,6 +651,9 @@ push_dataset_to_hub: # repo path
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
+# Keep dataset in memory while preprocessing
+# Only needed if cached dataset is taking too much storage
+dataset_keep_in_memory:
 # push checkpoints to hub
 hub_model_id: # repo path to push finetuned model
 # how to push checkpoints to hub
@@ -639,10 +675,6 @@ sequence_len: 2048
 # Pad inputs so each step uses constant sized buffers
 # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
 pad_to_sequence_len:
-# Max sequence length to concatenate training samples together up to
-# Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-# FutureWarning: This will soon be DEPRECATED
-max_packed_sequence_len: 1024
 # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
 sample_packing:
 # Set to 'false' if getting errors during eval with sample_packing on.
@@ -692,6 +724,12 @@ lora_modules_to_save:

 lora_fan_in_fan_out: false

+peft:
+  # Configuration options for loftq initialization for LoRA
+  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
+  loftq_config:
+    loftq_bits:  # typically 4 bits
+
 # ReLoRA configuration
 # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
 relora_steps: # Number of steps per ReLoRA restart
@@ -834,7 +872,8 @@ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
 # Whether to use scaled-dot-product attention
 # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
 sdp_attention:
-
+# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
+s2_attention:
 # Resume from a specific checkpoint dir
 resume_from_checkpoint:
 # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
@@ -858,7 +897,7 @@ tokens:
 fsdp:
 fsdp_config:

-# Deepspeed config path. e.g., deepspeed/zero3.json
+# Deepspeed config path. e.g., deepspeed_configs/zero3.json
 deepspeed:

 # Advanced DDP Arguments
@@ -951,6 +990,9 @@ Run
 accelerate launch -m axolotl.cli.train your_config.yml
 ```

+> [!TIP]
+> You can also reference a config file that is hosted on a public URL, for example `accelerate launch -m axolotl.cli.train https://yourdomain.com/your_config.yml`
+
 #### Preprocess dataset

 You can optionally pre-tokenize dataset with the following before finetuning.
@@ -979,11 +1021,11 @@ for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usa
 We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3.

 ```yaml
-deepspeed: deepspeed/zero1.json
+deepspeed: deepspeed_configs/zero1.json
 ```

 ```shell
-accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
+accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed_configs/zero1.json
 ```

 ##### FSDP
@@ -1122,7 +1164,7 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke
 1. Materialize some data using `python -m axolotl.cli.preprocess your_config.yml --debug`, and then decode the first few rows with your model's tokenizer.
 2. During inference, right before you pass a tensor of token ids to your model, decode these tokens back into a string.
 3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines.  If they aren't the same adjust your inference server accordingly.
-4. As an additional troubleshooting step, you can look look at the token ids between 1 and 2 to make sure they are identical.
+4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.

 Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this.  See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.

@@ -1130,9 +1172,11 @@ Having misalignment between your prompts during training and inference can cause

 See [this debugging guide](docs/debugging.md) for tips on debugging Axolotl, along with an example configuration for debugging with VSCode.

-## Need help? 🙋♂️
+## Need help? 🙋

-Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
+Join our [Discord server](https://discord.gg/HhrNrHJPRb) where our community members can help you.
+
+Need dedicated support? Please contact us at [✉️wing@openaccessaicollective.org](mailto:wing@openaccessaicollective.org) for dedicated support options.

 ## Badge ❤🏷️

@@ -1149,7 +1193,7 @@ Building something cool with Axolotl? Consider adding a badge to your model card
 Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.

 Open Access AI Collective
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
+- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b-fixed)
 - [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
 - [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)

@@ -1173,6 +1217,12 @@ pre-commit install
 pytest tests/
 ```

+Thanks to all of our contributors to date. Help drive open source AI progress forward by contributing to Axolotl.
+
+<a href="https://github.com/openaccess-ai-collective/axolotl/graphs/contributors">
+  <img src="https://contrib.rocks/image?repo=openaccess-ai-collective/axolotl" alt="contributor chart by https://contrib.rocks"/>
+</a>
+
 ## Sponsors 🤝❤

 OpenAccess AI Collective is run by volunteer contributors such as [winglian](https://github.com/winglian),
--- a/deepspeed_configs/zero1.json
+++ b/deepspeed_configs/zero1.json
@@ -15,15 +15,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero2.json
+++ b/deepspeed_configs/zero2.json
@@ -19,15 +19,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
@@ -23,15 +23,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -23,15 +23,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA

 RUN git lfs install --skip-repo && \
    pip3 install awscli && \
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -7,14 +7,19 @@ ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
 ENV HF_HOME="/workspace/data/huggingface-cache/hub"
 ENV HF_HUB_ENABLE_HF_TRANSFER="1"

-COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
+EXPOSE 8888
+EXPOSE 22

-RUN pip install jupyterlab notebook && \
+COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
+COPY scripts/motd /etc/motd
+
+RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
 RUN apt install --yes --no-install-recommends openssh-server tmux && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
+    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
    chmod +x /root/cloud-entrypoint.sh

--- a/docs/images/4d-mask.png
+++ b/docs/images/4d-mask.png
--- a/docs/multipack.md
+++ b/docs/multipack.md
@@ -1,4 +1,11 @@
-# Multipack
+# Multipack (Sample Packing)
+
+## Visualization of Multipack with Flash Attention
+
+Because Flash Attention simply drops the attention mask, we do not need to
+construct a 4d attention mask. We only need to concatenate the sequences into
+a single batch and let flash attention know where each new sequence begins.
+

 4k context, bsz =4,
 each character represents 256 tokens
@@ -49,3 +56,18 @@ w packing ( note it's the same effective number of tokens per step, but a true b
   E E E E F F F F F G G G H H H H
   I I I J J J J K K K K K L L L X ]]
 ```
+
+cu_seqlens:
+[[ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64]]
+
+
+## Multipack without Flash Attention
+
+Multipack can still be achieved without Flash attention, but with lower packing
+efficiency as we are not able to join multiple batches into a single batch due to
+context length limits without flash attention. We can use either Pytorch's Scaled
+Dot Product Attention implementation or native Pytorch attention implementation
+along with [4d attention masks](https://github.com/huggingface/transformers/pull/27539)
+to pack sequences together and avoid cross attention.
+
+<img src="./images/4d-mask.png" alt="axolotl" width="800">
--- a/docs/rlhf.md
+++ b/docs/rlhf.md
@@ -12,21 +12,21 @@ feedback. Various methods include, but not limited to:

 ### RLHF using Axolotl

-[!IMPORTANT]
-This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.
+>[!IMPORTANT]
+>This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.

 The various RL training methods are implemented in trl and wrapped via axolotl. Below are various examples with how you can use various preference datasets to train models that use ChatML

 #### DPO
 ```yaml
-rl: true
+rl: dpo
 datasets:
  - path: Intel/orca_dpo_pairs
    split: train
-    type: intel_apply_chatml
+    type: chatml.intel
  - path: argilla/ultrafeedback-binarized-preferences
    split: train
-    type: argilla_apply_chatml
+    type: chatml.argilla
 ```

 #### IPO
@@ -34,6 +34,16 @@ datasets:
 rl: ipo
 ```

+#### Using local dataset files
+```yaml
+datasets:
+  - ds_type: json
+    data_files:
+      - orca_rlhf.jsonl
+    split: train
+    type: chatml.intel
+```
+
 #### Trl autounwrap for peft

 Trl supports autounwrapping peft models, so that a ref model does not need to be additionally loaded, leading to less VRAM needed. This is on by default. To turn it off, pass the following config.
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -53,8 +53,8 @@ lr_quadratic_warmup: true
 learning_rate: 0.000085
 train_on_inputs: true
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: false
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -11,7 +11,6 @@ val_set_size: 0.05
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
-max_packed_sequence_len: 2048
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
@@ -36,8 +35,8 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -52,6 +52,7 @@ local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -52,6 +52,7 @@ local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -52,6 +52,7 @@ local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -0,0 +1,198 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "AKjdG7tbTb-n"
+      },
+      "source": [
+        "# Example notebook for running Axolotl on google colab"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "RcbNpOgWRcii"
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n",
+        "# Check that a GPU is available; a T4 (free tier) is enough to run this notebook\n",
+        "assert (torch.cuda.is_available()==True)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "h3nLav8oTRA5"
+      },
+      "source": [
+        "## Install Axolotl and dependencies"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3c3yGAwnOIdi",
+        "outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install torch==\"2.1.2\"\n",
+        "!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
+        "!pip install flash-attn==\"2.5.0\"\n",
+        "!pip install deepspeed==\"0.13.1\""
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BW2MFr7HTjub"
+      },
+      "source": [
+        "## Create a YAML config file"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "9pkF2dSoQEUN"
+      },
+      "outputs": [],
+      "source": [
+        "import yaml\n",
+        "\n",
+        "# Your YAML string\n",
+        "yaml_string = \"\"\"\n",
+        "base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
+        "model_type: LlamaForCausalLM\n",
+        "tokenizer_type: LlamaTokenizer\n",
+        "is_llama_derived_model: true\n",
+        "\n",
+        "load_in_8bit: false\n",
+        "load_in_4bit: true\n",
+        "strict: false\n",
+        "\n",
+        "datasets:\n",
+        "  - path: mhenrichsen/alpaca_2k_test\n",
+        "    type: alpaca\n",
+        "dataset_prepared_path:\n",
+        "val_set_size: 0.05\n",
+        "output_dir: ./qlora-out\n",
+        "\n",
+        "adapter: qlora\n",
+        "lora_model_dir:\n",
+        "\n",
+        "sequence_len: 1096\n",
+        "sample_packing: true\n",
+        "pad_to_sequence_len: true\n",
+        "\n",
+        "lora_r: 32\n",
+        "lora_alpha: 16\n",
+        "lora_dropout: 0.05\n",
+        "lora_target_modules:\n",
+        "lora_target_linear: true\n",
+        "lora_fan_in_fan_out:\n",
+        "\n",
+        "wandb_project:\n",
+        "wandb_entity:\n",
+        "wandb_watch:\n",
+        "wandb_name:\n",
+        "wandb_log_model:\n",
+        "\n",
+        "mlflow_experiment_name: colab-example\n",
+        "\n",
+        "gradient_accumulation_steps: 1\n",
+        "micro_batch_size: 1\n",
+        "num_epochs: 4\n",
+        "max_steps: 20\n",
+        "optimizer: paged_adamw_32bit\n",
+        "lr_scheduler: cosine\n",
+        "learning_rate: 0.0002\n",
+        "\n",
+        "train_on_inputs: false\n",
+        "group_by_length: false\n",
+        "bf16: false\n",
+        "fp16: true\n",
+        "tf32: false\n",
+        "\n",
+        "gradient_checkpointing: true\n",
+        "early_stopping_patience:\n",
+        "resume_from_checkpoint:\n",
+        "local_rank:\n",
+        "logging_steps: 1\n",
+        "xformers_attention:\n",
+        "flash_attention: false\n",
+        "\n",
+        "warmup_steps: 10\n",
+        "evals_per_epoch:\n",
+        "saves_per_epoch:\n",
+        "debug:\n",
+        "deepspeed:\n",
+        "weight_decay: 0.0\n",
+        "fsdp:\n",
+        "fsdp_config:\n",
+        "special_tokens:\n",
+        "\n",
+        "\"\"\"\n",
+        "\n",
+        "# Convert the YAML string to a Python dictionary\n",
+        "yaml_dict = yaml.safe_load(yaml_string)\n",
+        "\n",
+        "# Specify your file path\n",
+        "file_path = 'test_axolotl.yaml'\n",
+        "\n",
+        "# Write the YAML file\n",
+        "with open(file_path, 'w') as file:\n",
+        "    yaml.dump(yaml_dict, file)\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "bidoj8YLTusD"
+      },
+      "source": [
+        "## Launch the training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "ydTI2Jk2RStU",
+        "outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
+      },
+      "outputs": [],
+      "source": [
+        "# By using the ! prefix, the command will be executed as a bash command\n",
+        "!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -38,8 +38,8 @@ lr_scheduler: cosine
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
@@ -60,5 +60,5 @@ fsdp:
 fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
-  bos_token: ">>ABSTRACT<<"
+  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -64,8 +64,8 @@ lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 # stop training after this many evaluation losses have increased in a row
@@ -89,5 +89,5 @@ fsdp:
 fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
-  bos_token: ">>ABSTRACT<<"
+  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -38,8 +38,8 @@ lr_scheduler: cosine
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
@@ -60,5 +60,5 @@ fsdp:
 fsdp_config:
 special_tokens:
  pad_token: "<|endoftext|>"
-  bos_token: ">>ABSTRACT<<"
+  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -33,8 +33,8 @@ lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true
 gradient_checkpointing: true
 early_stopping_patience:
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -31,7 +31,7 @@ lr_scheduler: cosine
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
-bf16: true
+bf16: auto
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -62,11 +62,8 @@ evals_per_epoch: 4
 eval_table_size:
 saves_per_epoch: 1
 debug:
-deepspeed: #deepspeed/zero2.json # multi-gpu only
+deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
 weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -0,0 +1,70 @@
+base_model: NousResearch/Llama-2-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0.05
+output_dir: ./lora-out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+peft:
+  loftq_config:
+    loftq_bits: 4
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+s2_attention:
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_table_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -52,6 +52,7 @@ local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
+s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
@@ -64,6 +65,3 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -65,6 +65,3 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -47,8 +47,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -34,8 +34,8 @@ learning_rate: 5e-5
 train_on_inputs: false
 group_by_length: true

-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: false
--- a/examples/mistral/Mistral-7b-example/README.md
+++ b/examples/mistral/Mistral-7b-example/README.md
@@ -0,0 +1,12 @@
+# Description
+This repository presents an in-depth guide for fine-tuning Mistral-7b or any other compatible model using Axolotl, tailored specifically for chatbot development. It streamlines the process of fine-tuning and uploading the enhanced model to HuggingFace 🤗, thereby serving as an invaluable tool for developers in the AI and chatbot domain.
+
+**What’s Inside:**
+
+Beginner-Friendly Instructions: Comprehensive steps to guide you through fine-tuning your chosen model, including details on the data structure (jsonl), configuration, and the code itself.
+
+Hardware Utilized: For reference, the fine-tuning in this guide was performed using 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel).
+
+**Uploading to HuggingFace 🤗:**
+To upload your fine-tuned model to Hugging Face, include the following files:
+![Screenshot 2024-01-19 213932](https://github.com/OpenAccess-AI-Collective/axolotl/assets/138583191/d660eb84-2d76-46a1-9846-cf0aeb3006d9)
--- a/examples/mistral/Mistral-7b-example/code.ipynb
+++ b/examples/mistral/Mistral-7b-example/code.ipynb
@@ -0,0 +1,970 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3fe31229-8f6b-48bc-a86d-af8e5466d11c",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "GPU available? True\n",
+      "BF16 is supported? True\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Check if a GPU is available. I used 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel)\n",
+    "import torch\n",
+    "print('GPU available?', torch.cuda.is_available())\n",
+    "print('BF16 is supported?', torch.cuda.is_bf16_supported())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "1dee845b-f3cb-4b1e-bdd9-1a918eac140b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting huggingface_hub\n",
+      "  Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
+      "Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.7.1)\n",
+      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n",
+      "Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hInstalling collected packages: huggingface_hub\n",
+      "Successfully installed huggingface_hub-0.20.1\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install huggingface_hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "88731672-9050-4034-8266-11aaace2a44e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from huggingface_hub import notebook_login"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "6b5aa7d7-3b18-4c14-afd4-043c2c545259",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "60df98d7b0294289aad8b6c8cd023c3b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Log in to Hugging Face so you can push the model to the Hub later\n",
+    "import sys\n",
+    "stdout = sys.stdout\n",
+    "notebook_login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "b74d0635-5033-4494-b7bd-ff6822103d93",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# After notebook_login() nothing gets printed any more, so restore the sys.stdout saved earlier\n",
+    "sys.stdout = stdout"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "e3c3b088-45e7-484b-ae39-66beabc48da8",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning into 'axolotl'...\n",
+      "remote: Enumerating objects: 235, done.\u001b[K\n",
+      "remote: Counting objects: 100% (235/235), done.\u001b[K\n",
+      "remote: Compressing objects: 100% (207/207), done.\u001b[K\n",
+      "remote: Total 235 (delta 48), reused 123 (delta 13), pack-reused 0\u001b[K\n",
+      "Receiving objects: 100% (235/235), 1.46 MiB | 11.65 MiB/s, done.\n",
+      "Resolving deltas: 100% (48/48), done.\n"
+     ]
+    }
+   ],
+   "source": [
+    "#axolotl\n",
+    "!git clone -b main --depth 1 https://github.com/OpenAccess-AI-Collective/axolotl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "66927751-4fd6-4477-97fc-6ab08c9d9a74",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/axolotl\n"
+     ]
+    }
+   ],
+   "source": [
+    "cd axolotl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "fcccf8da-353b-4d70-8f55-5cfe08c7e6b9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0mObtaining file:///axolotl\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting auto-gptq==0.5.1\n",
+      "  Downloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
+      "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
+      "Collecting peft==0.6.0\n",
+      "  Downloading peft-0.6.0-py3-none-any.whl.metadata (23 kB)\n",
+      "Collecting transformers==4.36.2\n",
+      "  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.8/126.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting tokenizers==0.15.0\n",
+      "  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
+      "Collecting bitsandbytes>=0.41.1\n",
+      "  Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl.metadata (9.8 kB)\n",
+      "Collecting accelerate==0.24.1\n",
+      "  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting addict\n",
+      "  Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
+      "Collecting fire\n",
+      "  Downloading fire-0.5.0.tar.gz (88 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.3/88.3 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.10/site-packages (6.0.1)\n",
+      "Collecting datasets>=2.15.0\n",
+      "  Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting sentencepiece\n",
+      "  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting wandb\n",
+      "  Downloading wandb-0.16.1-py3-none-any.whl.metadata (9.8 kB)\n",
+      "Collecting einops\n",
+      "  Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)\n",
+      "Collecting optimum==1.13.2\n",
+      "  Downloading optimum-1.13.2.tar.gz (300 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting hf_transfer\n",
+      "  Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Collecting colorama\n",
+      "  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
+      "Collecting numba\n",
+      "  Downloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
+      "Requirement already satisfied: numpy>=1.24.4 in /opt/conda/lib/python3.10/site-packages (1.26.0)\n",
+      "Collecting bert-score==0.3.13\n",
+      "  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting evaluate==0.4.0\n",
+      "  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rouge-score==0.1.2\n",
+      "  Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting scipy\n",
+      "  Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting scikit-learn==1.2.2\n",
+      "  Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
+      "\u001b[?25hCollecting pynvml\n",
+      "  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting art\n",
+      "  Downloading art-6.1-py3-none-any.whl.metadata (69 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting fschat==0.2.34\n",
+      "  Downloading fschat-0.2.34-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting gradio==3.50.2\n",
+      "  Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)\n",
+      "Collecting tensorboard\n",
+      "  Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n",
+      "Collecting s3fs\n",
+      "  Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting gcsfs\n",
+      "  Downloading gcsfs-2023.12.2.post1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting xformers==0.0.23\n",
+      "  Downloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)\n",
+      "Collecting deepspeed\n",
+      "  Downloading deepspeed-0.12.6.tar.gz (1.2 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m109.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting flash-attn==2.3.3\n",
+      "  Downloading flash_attn-2.3.3.tar.gz (2.3 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m111.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (5.9.0)\n",
+      "Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (2.1.1)\n",
+      "Requirement already satisfied: huggingface-hub in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (0.20.1)\n",
+      "Collecting rouge (from auto-gptq==0.5.1)\n",
+      "  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)\n",
+      "Collecting gekko (from auto-gptq==0.5.1)\n",
+      "  Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hCollecting safetensors (from auto-gptq==0.5.1)\n",
+      "  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from auto-gptq==0.5.1) (4.65.0)\n",
+      "Collecting pandas>=1.0.1 (from bert-score==0.3.13)\n",
+      "  Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from bert-score==0.3.13) (2.31.0)\n",
+      "Collecting matplotlib (from bert-score==0.3.13)\n",
+      "  Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
+      "Collecting dill (from evaluate==0.4.0)\n",
+      "  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n",
+      "Collecting xxhash (from evaluate==0.4.0)\n",
+      "  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+      "Collecting multiprocess (from evaluate==0.4.0)\n",
+      "  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n",
+      "Requirement already satisfied: fsspec>=2021.05.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]>=2021.05.0->evaluate==0.4.0) (2023.10.0)\n",
+      "Collecting responses<0.19 (from evaluate==0.4.0)\n",
+      "  Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
+      "Collecting ninja (from flash-attn==2.3.3)\n",
+      "  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)\n",
+      "Collecting aiohttp (from fschat==0.2.34)\n",
+      "  Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
+      "Collecting fastapi (from fschat==0.2.34)\n",
+      "  Downloading fastapi-0.108.0-py3-none-any.whl.metadata (24 kB)\n",
+      "Collecting httpx (from fschat==0.2.34)\n",
+      "  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n",
+      "Collecting markdown2[all] (from fschat==0.2.34)\n",
+      "  Downloading markdown2-2.4.12-py2.py3-none-any.whl.metadata (2.0 kB)\n",
+      "Collecting nh3 (from fschat==0.2.34)\n",
+      "  Downloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
+      "Requirement already satisfied: prompt-toolkit>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from fschat==0.2.34) (3.0.36)\n",
+      "Collecting pydantic<2,>=1 (from fschat==0.2.34)\n",
+      "  Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rich>=10.0.0 (from fschat==0.2.34)\n",
+      "  Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)\n",
+      "Collecting shortuuid (from fschat==0.2.34)\n",
+      "  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)\n",
+      "Collecting tiktoken (from fschat==0.2.34)\n",
+      "  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting uvicorn (from fschat==0.2.34)\n",
+      "  Downloading uvicorn-0.25.0-py3-none-any.whl.metadata (6.4 kB)\n",
+      "Collecting aiofiles<24.0,>=22.0 (from gradio==3.50.2)\n",
+      "  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
+      "Collecting altair<6.0,>=4.2.0 (from gradio==3.50.2)\n",
+      "  Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)\n",
+      "Collecting ffmpy (from gradio==3.50.2)\n",
+      "  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting gradio-client==0.6.1 (from gradio==3.50.2)\n",
+      "  Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)\n",
+      "Collecting importlib-resources<7.0,>=1.3 (from gradio==3.50.2)\n",
+      "  Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)\n",
+      "Requirement already satisfied: jinja2<4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (3.1.2)\n",
+      "Requirement already satisfied: markupsafe~=2.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (2.1.1)\n",
+      "Collecting orjson~=3.0 (from gradio==3.50.2)\n",
+      "  Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: pillow<11.0,>=8.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (10.0.1)\n",
+      "Collecting pydub (from gradio==3.50.2)\n",
+      "  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
+      "Collecting python-multipart (from gradio==3.50.2)\n",
+      "  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting semantic-version~=2.0 (from gradio==3.50.2)\n",
+      "  Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
+      "Requirement already satisfied: typing-extensions~=4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (4.7.1)\n",
+      "Collecting websockets<12.0,>=10.0 (from gradio==3.50.2)\n",
+      "  Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting coloredlogs (from optimum==1.13.2)\n",
+      "  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from optimum==1.13.2) (1.11.1)\n",
+      "Collecting absl-py (from rouge-score==0.1.2)\n",
+      "  Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)\n",
+      "Collecting nltk (from rouge-score==0.1.2)\n",
+      "  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score==0.1.2) (1.16.0)\n",
+      "Collecting joblib>=1.1.1 (from scikit-learn==1.2.2)\n",
+      "  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n",
+      "Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.2.2)\n",
+      "  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers==4.36.2) (3.9.0)\n",
+      "Collecting regex!=2019.12.17 (from transformers==4.36.2)\n",
+      "  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate==0.24.1) (3.1)\n",
+      "Collecting pyarrow>=8.0.0 (from datasets>=2.15.0)\n",
+      "  Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n",
+      "Collecting pyarrow-hotfix (from datasets>=2.15.0)\n",
+      "  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n",
+      "Collecting hjson (from deepspeed)\n",
+      "  Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting py-cpuinfo (from deepspeed)\n",
+      "  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
+      "Collecting termcolor (from fire)\n",
+      "  Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.10/site-packages (from gcsfs) (5.1.1)\n",
+      "INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.\n",
+      "Collecting gcsfs\n",
+      "  Downloading gcsfs-2023.12.1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading gcsfs-2023.12.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting google-auth>=1.2 (from gcsfs)\n",
+      "  Downloading google_auth-2.25.2-py2.py3-none-any.whl.metadata (4.7 kB)\n",
+      "Collecting google-auth-oauthlib (from gcsfs)\n",
+      "  Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-cloud-storage (from gcsfs)\n",
+      "  Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting llvmlite<0.42,>=0.41.0dev0 (from numba)\n",
+      "  Downloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
+      "Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)\n",
+      "  Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)\n",
+      "INFO: pip is looking at multiple versions of s3fs to determine which version is compatible with other requirements. This could take a while.\n",
+      "Collecting s3fs\n",
+      "  Downloading s3fs-2023.12.1-py3-none-any.whl.metadata (1.6 kB)\n",
+      "  Downloading s3fs-2023.10.0-py3-none-any.whl.metadata (1.6 kB)\n",
+      "Collecting aiobotocore~=2.7.0 (from s3fs)\n",
+      "  Downloading aiobotocore-2.7.0-py3-none-any.whl.metadata (20 kB)\n",
+      "Collecting grpcio>=1.48.2 (from tensorboard)\n",
+      "  Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n",
+      "Collecting markdown>=2.6.8 (from tensorboard)\n",
+      "  Downloading Markdown-3.5.1-py3-none-any.whl.metadata (7.1 kB)\n",
+      "Collecting protobuf<4.24,>=3.19.6 (from tensorboard)\n",
+      "  Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (540 bytes)\n",
+      "Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard) (68.0.0)\n",
+      "Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)\n",
+      "  Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)\n",
+      "Collecting werkzeug>=1.0.1 (from tensorboard)\n",
+      "  Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)\n",
+      "Requirement already satisfied: Click!=8.0.0,>=7.1 in /opt/conda/lib/python3.10/site-packages (from wandb) (8.1.7)\n",
+      "Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
+      "  Downloading GitPython-3.1.40-py3-none-any.whl.metadata (12 kB)\n",
+      "Collecting sentry-sdk>=1.0.0 (from wandb)\n",
+      "  Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl.metadata (9.7 kB)\n",
+      "Collecting docker-pycreds>=0.4.0 (from wandb)\n",
+      "  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
+      "Collecting setproctitle (from wandb)\n",
+      "  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)\n",
+      "Collecting appdirs>=1.4.3 (from wandb)\n",
+      "  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
+      "Collecting botocore<1.31.65,>=1.31.16 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading botocore-1.31.64-py3-none-any.whl.metadata (6.1 kB)\n",
+      "Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
+      "Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->fschat==0.2.34) (23.1.0)\n",
+      "Collecting multidict<7.0,>=4.5 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n",
+      "Collecting frozenlist>=1.1.1 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
+      "Collecting aiosignal>=1.1.2 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
+      "Collecting async-timeout<5.0,>=4.0 (from aiohttp->fschat==0.2.34)\n",
+      "  Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n",
+      "Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (4.20.0)\n",
+      "Requirement already satisfied: toolz in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (0.12.0)\n",
+      "Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
+      "  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
+      "Collecting cachetools<6.0,>=2.0.0 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)\n",
+      "Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.3/181.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting rsa<5,>=3.1.4 (from google-auth>=1.2->gcsfs)\n",
+      "  Downloading rsa-4.9-py3-none-any.whl (34 kB)\n",
+      "Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->gcsfs)\n",
+      "  Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n",
+      "Collecting contourpy>=1.0.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
+      "Collecting cycler>=0.10 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
+      "Collecting fonttools>=4.22.0 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (157 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.4 kB)\n",
+      "Collecting pyparsing>=2.3.1 (from matplotlib->bert-score==0.3.13)\n",
+      "  Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->bert-score==0.3.13) (2.8.2)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.1->bert-score==0.3.13) (2023.3.post1)\n",
+      "Collecting tzdata>=2022.1 (from pandas>=1.0.1->bert-score==0.3.13)\n",
+      "  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit>=3.0.0->fschat==0.2.34) (0.2.5)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2023.7.22)\n",
+      "Collecting markdown-it-py>=2.2.0 (from rich>=10.0.0->fschat==0.2.34)\n",
+      "  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
+      "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=10.0.0->fschat==0.2.34) (2.15.1)\n",
+      "Collecting h11>=0.8 (from uvicorn->fschat==0.2.34)\n",
+      "  Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum==1.13.2)\n",
+      "  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting starlette<0.33.0,>=0.29.0 (from fastapi->fschat==0.2.34)\n",
+      "  Downloading starlette-0.32.0.post1-py3-none-any.whl.metadata (5.8 kB)\n",
+      "Collecting typing-extensions~=4.0 (from gradio==3.50.2)\n",
+      "  Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)\n",
+      "Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n",
+      "Collecting google-resumable-media>=2.6.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)\n",
+      "Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->gcsfs)\n",
+      "  Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\n",
+      "Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (4.2.0)\n",
+      "Collecting httpcore==1.* (from httpx->fschat==0.2.34)\n",
+      "  Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)\n",
+      "Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (1.3.0)\n",
+      "Collecting wavedrom (from markdown2[all]->fschat==0.2.34)\n",
+      "  Downloading wavedrom-2.0.3.post3.tar.gz (137 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.7/137.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->optimum==1.13.2) (1.3.0)\n",
+      "Collecting jmespath<2.0.0,>=0.7.1 (from botocore<1.31.65,>=1.31.16->aiobotocore~=2.7.0->s3fs)\n",
+      "  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
+      "Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
+      "  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
+      "Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage->gcsfs)\n",
+      "  Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl.metadata (1.5 kB)\n",
+      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (2023.12.1)\n",
+      "Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.32.0)\n",
+      "Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.15.2)\n",
+      "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.0.0->fschat==0.2.34)\n",
+      "  Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
+      "Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs)\n",
+      "  Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)\n",
+      "Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs)\n",
+      "  Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hRequirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx->fschat==0.2.34) (1.0.4)\n",
+      "Collecting svgwrite (from wavedrom->markdown2[all]->fschat==0.2.34)\n",
+      "  Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading fschat-0.2.34-py3-none-any.whl (220 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.1/220.1 kB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gradio-3.50.2-py3-none-any.whl (20.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.3/20.3 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading peft-0.6.0-py3-none-any.whl (134 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.9/134.9 kB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m90.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl (213.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 MB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m299.2/299.2 kB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading datasets-2.16.0-py3-none-any.whl (507 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m87.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.4/36.4 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading art-6.1-py3-none-any.whl (599 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m599.8/599.8 kB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gcsfs-2023.10.0-py2.py3-none-any.whl (33 kB)\n",
+      "Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m99.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading s3fs-2023.10.0-py3-none-any.whl (28 kB)\n",
+      "Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m96.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading wandb-0.16.1-py3-none-any.whl (2.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading absl_py-2.0.0-py3-none-any.whl (130 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading aiobotocore-2.7.0-py3-none-any.whl (73 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
+      "Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading altair-5.2.0-py3-none-any.whl (996 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m996.9/996.9 kB\u001b[0m \u001b[31m110.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m47.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_auth-2.25.2-py2.py3-none-any.whl (184 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m184.2/184.2 kB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)\n",
+      "Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading importlib_resources-6.1.1-py3-none-any.whl (33 kB)\n",
+      "Downloading joblib-1.3.2-py3-none-any.whl (302 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.2/302.2 kB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 MB\u001b[0m \u001b[31m74.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading Markdown-3.5.1-py3-none-any.whl (102 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.2/102.2 kB\u001b[0m \u001b[31m34.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m99.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m96.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.0/38.0 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m116.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading rich-13.7.0-py3-none-any.whl (240 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m240.6/240.6 kB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m102.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m71.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl (6.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)\n",
+      "Downloading uvicorn-0.25.0-py3-none-any.whl (60 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.7/226.7 kB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading fastapi-0.108.0-py3-none-any.whl (92 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
+      "Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl (121 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 kB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m108.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.2/307.2 kB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
+      "Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
+      "Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
+      "Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m101.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
+      "Downloading botocore-1.31.64-py3-none-any.whl (11.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m98.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
+      "\u001b[?25hDownloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\n",
+      "Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (310 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.7/310.7 kB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
+      "Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m102.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hDownloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.5/239.5 kB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.0/122.0 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\n",
+      "Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyparsing-3.1.1-py3-none-any.whl (103 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.1/103.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading starlette-0.32.0.post1-py3-none-any.whl (70 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.0/70.0 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.3/80.3 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.6/301.6 kB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading markdown2-2.4.12-py2.py3-none-any.whl (41 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.2/41.2 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.7/228.7 kB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.9/84.9 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
+      "Building wheels for collected packages: flash-attn, optimum, rouge-score, deepspeed, fire, ffmpy, wavedrom\n",
+      "  Building wheel for flash-attn (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for flash-attn: filename=flash_attn-2.3.3-cp310-cp310-linux_x86_64.whl size=57042553 sha256=b1df92cb5bd7657d38b789dd48e907aa3e0bd2715c817eb85f3c4320bb11fb3f\n",
+      "  Stored in directory: /root/.cache/pip/wheels/e5/e6/fa/941802ec61d1afd320d27160ab1db98e6dba65381f84b76d4a\n",
+      "  Building wheel for optimum (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for optimum: filename=optimum-1.13.2-py3-none-any.whl size=395599 sha256=ff3a73120e1b6eeeda28f76e3fc8cd4cd826e5d66c869b7848ba150e7af79c62\n",
+      "  Stored in directory: /root/.cache/pip/wheels/6e/b7/2c/79405d98f0943373d8546daeae25a3d377f7659ca0cbe48699\n",
+      "  Building wheel for rouge-score (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=8118ecbbcd3529085e794c803f0ddb182fc6c6d3e8a494103b49a94abf1bec37\n",
+      "  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
+      "  Building wheel for deepspeed (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for deepspeed: filename=deepspeed-0.12.6-py3-none-any.whl size=1306729 sha256=35c46b6f0275b0d3063522e0af4f3cbd9ec1c310114d8917d87cbe2bf43346e2\n",
+      "  Stored in directory: /root/.cache/pip/wheels/a3/dc/a2/f585faaed4dec84108916dcc8e8a7c129a216df8202ca32984\n",
+      "  Building wheel for fire (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116934 sha256=e76d5185f237f34ec69bb8aa657497bef07408978e4f7efdaef48663bb8cd4ef\n",
+      "  Stored in directory: /root/.cache/pip/wheels/90/d4/f7/9404e5db0116bd4d43e5666eaa3e70ab53723e1e3ea40c9a95\n",
+      "  Building wheel for ffmpy (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5579 sha256=da3b54dc0ac1a825a1a233315970ac80b8b4c53ebd9cb2a2cfdeab118f453a64\n",
+      "  Stored in directory: /root/.cache/pip/wheels/01/a6/d1/1c0828c304a4283b2c1639a09ad86f83d7c487ef34c6b4a1bf\n",
+      "  Building wheel for wavedrom (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for wavedrom: filename=wavedrom-2.0.3.post3-py2.py3-none-any.whl size=30052 sha256=7f0cbd15d63ee9c120190bac122ab51bbbfc91ee374bc3c046fadb320816c17e\n",
+      "  Stored in directory: /root/.cache/pip/wheels/9c/52/8c/38b454b42f712f325e26f633287484c7dc1ad469e1580c5954\n",
+      "Successfully built flash-attn optimum rouge-score deepspeed fire ffmpy wavedrom\n",
+      "Installing collected packages: sentencepiece, pydub, py-cpuinfo, ninja, nh3, hjson, ffmpy, bitsandbytes, appdirs, addict, xxhash, wrapt, werkzeug, websockets, tzdata, typing-extensions, threadpoolctl, termcolor, tensorboard-data-server, svgwrite, smmap, shortuuid, setproctitle, sentry-sdk, semantic-version, scipy, safetensors, rouge, regex, python-multipart, pyparsing, pynvml, pyasn1, pyarrow-hotfix, pyarrow, protobuf, orjson, oauthlib, multidict, mdurl, markdown2, markdown, llvmlite, kiwisolver, joblib, jmespath, importlib-resources, humanfriendly, hf_transfer, h11, grpcio, google-crc32c, gekko, frozenlist, fonttools, einops, docker-pycreds, dill, cycler, contourpy, colorama, cachetools, async-timeout, art, aioitertools, aiofiles, absl-py, yarl, wavedrom, uvicorn, tiktoken, scikit-learn, rsa, responses, requests-oauthlib, pydantic, pyasn1-modules, pandas, numba, nltk, multiprocess, matplotlib, markdown-it-py, httpcore, googleapis-common-protos, google-resumable-media, gitdb, fire, coloredlogs, botocore, aiosignal, xformers, tokenizers, starlette, rouge-score, rich, httpx, google-auth, GitPython, flash-attn, deepspeed, aiohttp, accelerate, wandb, transformers, gradio-client, google-auth-oauthlib, google-api-core, fastapi, altair, aiobotocore, tensorboard, s3fs, peft, gradio, google-cloud-core, fschat, datasets, bert-score, optimum, google-cloud-storage, evaluate, auto-gptq, gcsfs, axolotl\n",
+      "  Attempting uninstall: typing-extensions\n",
+      "    Found existing installation: typing_extensions 4.7.1\n",
+      "    Uninstalling typing_extensions-4.7.1:\n",
+      "      Successfully uninstalled typing_extensions-4.7.1\n",
+      "  Running setup.py develop for axolotl\n",
+      "Successfully installed GitPython-3.1.40 absl-py-2.0.0 accelerate-0.24.1 addict-2.4.0 aiobotocore-2.7.0 aiofiles-23.2.1 aiohttp-3.9.1 aioitertools-0.11.0 aiosignal-1.3.1 altair-5.2.0 appdirs-1.4.4 art-6.1 async-timeout-4.0.3 auto-gptq-0.5.1 axolotl-0.3.0 bert-score-0.3.13 bitsandbytes-0.41.3.post2 botocore-1.31.64 cachetools-5.3.2 colorama-0.4.6 coloredlogs-15.0.1 contourpy-1.2.0 cycler-0.12.1 datasets-2.16.0 deepspeed-0.12.6 dill-0.3.7 docker-pycreds-0.4.0 einops-0.7.0 evaluate-0.4.0 fastapi-0.108.0 ffmpy-0.3.1 fire-0.5.0 flash-attn-2.3.3 fonttools-4.47.0 frozenlist-1.4.1 fschat-0.2.34 gcsfs-2023.10.0 gekko-1.0.6 gitdb-4.0.11 google-api-core-2.15.0 google-auth-2.25.2 google-auth-oauthlib-1.2.0 google-cloud-core-2.4.1 google-cloud-storage-2.14.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 googleapis-common-protos-1.62.0 gradio-3.50.2 gradio-client-0.6.1 grpcio-1.60.0 h11-0.14.0 hf_transfer-0.1.4 hjson-3.1.0 httpcore-1.0.2 httpx-0.26.0 humanfriendly-10.0 importlib-resources-6.1.1 jmespath-1.0.1 joblib-1.3.2 kiwisolver-1.4.5 llvmlite-0.41.1 markdown-3.5.1 markdown-it-py-3.0.0 markdown2-2.4.12 matplotlib-3.8.2 mdurl-0.1.2 multidict-6.0.4 multiprocess-0.70.15 nh3-0.2.15 ninja-1.11.1.1 nltk-3.8.1 numba-0.58.1 oauthlib-3.2.2 optimum-1.13.2 orjson-3.9.10 pandas-2.1.4 peft-0.6.0 protobuf-4.23.4 py-cpuinfo-9.0.0 pyarrow-14.0.2 pyarrow-hotfix-0.6 pyasn1-0.5.1 pyasn1-modules-0.3.0 pydantic-1.10.13 pydub-0.25.1 pynvml-11.5.0 pyparsing-3.1.1 python-multipart-0.0.6 regex-2023.12.25 requests-oauthlib-1.3.1 responses-0.18.0 rich-13.7.0 rouge-1.0.1 rouge-score-0.1.2 rsa-4.9 s3fs-2023.10.0 safetensors-0.4.1 scikit-learn-1.2.2 scipy-1.11.4 semantic-version-2.10.0 sentencepiece-0.1.99 sentry-sdk-1.39.1 setproctitle-1.3.3 shortuuid-1.0.11 smmap-5.0.1 starlette-0.32.0.post1 svgwrite-1.4.3 tensorboard-2.15.1 tensorboard-data-server-0.7.2 termcolor-2.4.0 threadpoolctl-3.2.0 tiktoken-0.5.2 tokenizers-0.15.0 transformers-4.36.2 typing-extensions-4.8.0 tzdata-2023.3 
uvicorn-0.25.0 wandb-0.16.1 wavedrom-2.0.3.post3 websockets-11.0.3 werkzeug-3.0.1 wrapt-1.16.0 xformers-0.0.23 xxhash-3.4.1 yarl-1.9.4\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0mCollecting git+https://github.com/huggingface/peft.git\n",
+      "  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-hka8xgk2\n",
+      "  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-hka8xgk2\n",
+      "  Resolved https://github.com/huggingface/peft.git to commit cf04d0353f0343cbf66627228c4495f51669af34\n",
+      "  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (1.26.0)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (23.1)\n",
+      "Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (5.9.0)\n",
+      "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (6.0.1)\n",
+      "Requirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (2.1.1)\n",
+      "Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.36.2)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.65.0)\n",
+      "Requirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.24.1)\n",
+      "Requirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.4.1)\n",
+      "Requirement already satisfied: huggingface-hub>=0.17.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.20.1)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.9.0)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.10.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.31.0)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (4.8.0)\n",
+      "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (1.11.1)\n",
+      "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1)\n",
+      "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1.2)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (2023.12.25)\n",
+      "Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (0.15.0)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft==0.7.2.dev0) (2.1.1)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.0.4)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.4)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (1.26.18)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.7.22)\n",
+      "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft==0.7.2.dev0) (1.3.0)\n",
+      "Building wheels for collected packages: peft\n",
+      "  Building wheel for peft (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for peft: filename=peft-0.7.2.dev0-py3-none-any.whl size=169456 sha256=4c70d23e759fa6abb3827fb2f3a8683be3b24d78777d0f403bbc2c0548e5dd4b\n",
+      "  Stored in directory: /tmp/pip-ephem-wheel-cache-my5ncou6/wheels/d7/c7/de/1368fac8590e1b103ddc2ec2a28ad51d83aded1a3830e8a087\n",
+      "Successfully built peft\n",
+      "Installing collected packages: peft\n",
+      "  Attempting uninstall: peft\n",
+      "    Found existing installation: peft 0.6.0\n",
+      "    Uninstalling peft-0.6.0:\n",
+      "      Successfully uninstalled peft-0.6.0\n",
+      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+      "axolotl 0.3.0 requires peft==0.6.0, but you have peft 0.7.2.dev0 which is incompatible.\u001b[0m\u001b[31m\n",
+      "\u001b[0mSuccessfully installed peft-0.7.2.dev0\n",
+      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "#installing what is needed inside axolotl file\n",
+    "!pip install packaging\n",
+    "!pip install -e '.[flash-attn,deepspeed]'\n",
+    "!pip install -U git+https://github.com/huggingface/peft.git"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "82d1a380-1e87-48fe-89fe-25331326014d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The following values were not passed to `accelerate launch` and had defaults used instead:\n",
+      "\t`--num_processes` was set to a value of `3`\n",
+      "\t\tMore than one GPU was found, enabling multi-GPU training.\n",
+      "\t\tIf this was unintended please pass in `--num_processes=1`.\n",
+      "\t`--num_machines` was set to a value of `1`\n",
+      "\t`--mixed_precision` was set to a value of `'no'`\n",
+      "\t`--dynamo_backend` was set to a value of `'no'`\n",
+      "To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "[2023-12-28 15:44:09,979] [INFO] [datasets.<module>:58] [PID:2814] PyTorch version 2.1.1 available.\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
+      "  warnings.warn(\n",
+      "[2023-12-28 15:44:10,011] [INFO] [datasets.<module>:58] [PID:2812] PyTorch version 2.1.1 available.\n",
+      "[2023-12-28 15:44:10,013] [INFO] [datasets.<module>:58] [PID:2813] PyTorch version 2.1.1 available.\n",
+      "[2023-12-28 15:44:10,805] [INFO] [axolotl.normalize_config:150] [PID:2814] [RANK:2] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,830] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,842] [INFO] [axolotl.normalize_config:150] [PID:2813] [RANK:1] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,865] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,869] [INFO] [axolotl.normalize_config:150] [PID:2812] [RANK:0] GPU memory usage baseline: 0.000GB (+0.351GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:10,887] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
+      "[2023-12-28 15:44:10,961] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:10,994] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:11,015] [INFO] [comm.py:637:init_distributed] cdb=None\n",
+      "[2023-12-28 15:44:11,015] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
+      "                                 dP            dP   dP \n",
+      "                                 88            88   88 \n",
+      "      .d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88 \n",
+      "      88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88 \n",
+      "      88.  .88  .d88b.  88.  .88 88 88.  .88   88   88 \n",
+      "      `88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP \n",
+      "                                                       \n",
+      "                                                       \n",
+      "\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:11,413] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2812] [RANK:0] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:11,415] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2812] [RANK:0] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:11,530] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2813] [RANK:1] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2814] [RANK:2] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
+      "[2023-12-28 15:44:12,160] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2813] [RANK:1] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:12,161] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2814] [RANK:2] Prepared dataset loaded from disk...\u001b[39m\n",
+      "[2023-12-28 15:44:12,236] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_tokens: 28120\u001b[39m\n",
+      "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] `total_supervised_tokens: 7990`\u001b[39m\n",
+      "[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_steps: 6\u001b[39m\n",
+      "[2023-12-28 15:44:12,242] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading tokenizer... mistralai/Mistral-7B-v0.1\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,518] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading model and peft_config...\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
+      "[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
+      "[2023-12-28 15:44:13,049] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 7.24B\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00,  5.81s/it]\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00,  5.98s/it]\n",
+      "[2023-12-28 15:44:25,395] [INFO] [axolotl.load_model:503] [PID:2813] [RANK:1] GPU memory usage after model load: 7.576GB (+0.524GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,399] [INFO] [axolotl.load_model:526] [PID:2813] [RANK:1] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:25,403] [INFO] [axolotl.load_model:538] [PID:2813] [RANK:1] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:25,480] [INFO] [axolotl.load_model:568] [PID:2813] [RANK:1] GPU memory usage after adapters: 7.589GB (+1.501GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,572] [INFO] [axolotl.load_model:503] [PID:2814] [RANK:2] GPU memory usage after model load: 7.576GB (+0.410GB cache, +0.708GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:25,576] [INFO] [axolotl.load_model:526] [PID:2814] [RANK:2] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:25,580] [INFO] [axolotl.load_model:538] [PID:2814] [RANK:2] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:25,660] [INFO] [axolotl.load_model:568] [PID:2814] [RANK:2] GPU memory usage after adapters: 7.589GB (+1.388GB cache, +0.708GB misc)\u001b[39m\n",
+      "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:12<00:00,  6.30s/it]\n",
+      "[2023-12-28 15:44:26,170] [INFO] [axolotl.load_model:503] [PID:2812] [RANK:0] GPU memory usage after model load: 7.576GB (+0.776GB cache, +0.741GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:26,177] [INFO] [axolotl.load_model:526] [PID:2812] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
+      "[2023-12-28 15:44:26,181] [INFO] [axolotl.load_model:538] [PID:2812] [RANK:0] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
+      "trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
+      "[2023-12-28 15:44:26,259] [INFO] [axolotl.load_model:568] [PID:2812] [RANK:0] GPU memory usage after adapters: 7.589GB (+1.753GB cache, +0.741GB misc)\u001b[39m\n",
+      "[2023-12-28 15:44:26,293] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Pre-saving adapter config to ./out\u001b[39m\n",
+      "[2023-12-28 15:44:26,296] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Starting trainer...\u001b[39m\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
+      "Detected CUDA files, patching ldflags\n",
+      "Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n",
+      "Building extension module fused_adam...\n",
+      "Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
+      "ninja: no work to do.\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.05891108512878418 seconds\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.10173463821411133 seconds\n",
+      "Loading extension module fused_adam...\n",
+      "Time to load fused_adam op: 0.10152459144592285 seconds\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
+      "  self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
+      "Parameter Offload: Total persistent parameters: 3674112 in 193 params\n",
+      "  0%|                                                    | 0/17 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
+      "  warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
+      "{'loss': 2.0448, 'learning_rate': 2e-05, 'epoch': 0.06}                         \n",
+      "  6%|██▌                                         | 1/17 [00:28<07:32, 28.30s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.85s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.9694719314575195, 'eval_runtime': 11.391, 'eval_samples_per_second': 1.492, 'eval_steps_per_second': 0.263, 'epoch': 0.06}\n",
+      "  6%|██▌                                         | 1/17 [00:39<07:32, 28.30s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.65s/it]\u001b[A\n",
+      "                                                                                \u001b[A[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2812] [RANK:0] GPU memory usage while training: 12.210GB (+4.259GB cache, +0.776GB misc)\u001b[39m\n",
+      " 12%|█████▏                                      | 2/17 [01:04<08:18, 33.20s/it][2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2814] [RANK:2] GPU memory usage while training: 12.269GB (+4.522GB cache, +0.743GB misc)\u001b[39m\n",
+      "[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2813] [RANK:1] GPU memory usage while training: 12.283GB (+4.493GB cache, +0.743GB misc)\u001b[39m\n",
+      "{'loss': 2.0022, 'learning_rate': 4e-05, 'epoch': 0.12}                         \n",
+      "{'loss': 2.1054, 'learning_rate': 6e-05, 'epoch': 0.17}                         \n",
+      "{'loss': 1.9004, 'learning_rate': 8e-05, 'epoch': 0.23}                         \n",
+      "{'loss': 1.8794, 'learning_rate': 0.0001, 'epoch': 0.29}                        \n",
+      " 29%|████████████▉                               | 5/17 [02:20<05:23, 26.92s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.88s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.7912336587905884, 'eval_runtime': 11.3106, 'eval_samples_per_second': 1.503, 'eval_steps_per_second': 0.265, 'epoch': 0.29}\n",
+      " 29%|████████████▉                               | 5/17 [02:32<05:23, 26.92s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.67s/it]\u001b[A\n",
+      "{'loss': 1.7871, 'learning_rate': 0.00012, 'epoch': 0.35}                       \u001b[A\n",
+      "{'loss': 1.7758, 'learning_rate': 0.00014, 'epoch': 0.4}                        \n",
+      "{'loss': 1.4645, 'learning_rate': 0.00016, 'epoch': 0.46}                       \n",
+      "{'loss': 1.4009, 'learning_rate': 0.00018, 'epoch': 0.52}                       \n",
+      "{'loss': 1.3927, 'learning_rate': 0.0002, 'epoch': 0.58}                        \n",
+      " 59%|█████████████████████████▎                 | 10/17 [04:38<03:04, 26.33s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.89s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 1.1426481008529663, 'eval_runtime': 11.3344, 'eval_samples_per_second': 1.5, 'eval_steps_per_second': 0.265, 'epoch': 0.58}\n",
+      " 59%|█████████████████████████▎                 | 10/17 [04:49<03:04, 26.33s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.68s/it]\u001b[A\n",
+      "{'loss': 1.0122, 'learning_rate': 0.0001900968867902419, 'epoch': 0.63}         \u001b[A\n",
+      "{'loss': 1.0019, 'learning_rate': 0.00016234898018587337, 'epoch': 0.69}        \n",
+      "{'loss': 0.8976, 'learning_rate': 0.00012225209339563145, 'epoch': 0.75}        \n",
+      "{'loss': 0.9301, 'learning_rate': 7.774790660436858e-05, 'epoch': 0.81}         \n",
+      "{'loss': 0.8595, 'learning_rate': 3.7651019814126654e-05, 'epoch': 0.87}        \n",
+      " 88%|█████████████████████████████████████▉     | 15/17 [06:55<00:52, 26.17s/it]\n",
+      "  0%|                                                     | 0/3 [00:00<?, ?it/s]\u001b[A\n",
+      " 67%|██████████████████████████████               | 2/3 [00:03<00:01,  1.88s/it]\u001b[A\n",
+      "                                                                                \u001b[A\n",
+      "\u001b[A{'eval_loss': 0.8175248503684998, 'eval_runtime': 11.2932, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.266, 'epoch': 0.87}\n",
+      " 88%|█████████████████████████████████████▉     | 15/17 [07:06<00:52, 26.17s/it]\n",
+      "100%|█████████████████████████████████████████████| 3/3 [00:07<00:00,  2.67s/it]\u001b[A\n",
+      "{'loss': 0.7931, 'learning_rate': 9.903113209758096e-06, 'epoch': 0.92}         \u001b[A\n",
+      "{'loss': 0.6909, 'learning_rate': 0.0, 'epoch': 0.98}                           \n",
+      "100%|███████████████████████████████████████████| 17/17 [07:56<00:00, 28.03s/it]/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
+      "  warnings.warn(\n",
+      "{'train_runtime': 489.0649, 'train_samples_per_second': 0.63, 'train_steps_per_second': 0.035, 'train_loss': 1.408153467318591, 'epoch': 0.98}\n",
+      "100%|███████████████████████████████████████████| 17/17 [08:09<00:00, 28.77s/it]\n",
+      "[2023-12-28 15:52:39,488] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Training Completed!!! Saving pre-trained model to ./out\u001b[39m\n",
+      "\u001b[0m\u001b[0m\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "Training using the config.yml file and using deepspeed:zero3_bf16 the most aggressive optimization out of zero1,zero2,zero3 stages which partitions \n",
+    "not only optimizer states but also gradients and parameters across GPUs. The bf16 indicate mixed precision training using bfloat16.\n",
+    "For more information read axolotl's readme\n",
+    "\"\"\"\n",
+    "!accelerate launch -m axolotl.cli.train /folder/config.yml --deepspeed deepspeed_configs/zero3_bf16.json"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/examples/mistral/Mistral-7b-example/config.yml
+++ b/examples/mistral/Mistral-7b-example/config.yml
@@ -0,0 +1,75 @@
+#Mistral-7b
+base_model: mistralai/Mistral-7B-v0.1
+model_type: MistralForCausalLM
+tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
+    # For the type and conversation arguments, see the axolotl README and pick what suits your project; this example is a chatbot, so it uses sharegpt and chatml
+    type: sharegpt
+    conversation: chatml
+dataset_prepared_path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
+val_set_size: 0.05
+output_dir: ./out
+
+#using lora for lower cost
+adapter: lora
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+
+sequence_len: 512
+sample_packing: false
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+#only 2 epochs because of small dataset
+gradient_accumulation_steps: 3
+micro_batch_size: 2
+num_epochs: 2
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: true
+fp16: false
+tf32: false
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 10
+evals_per_epoch: 4
+eval_table_size:
+eval_table_max_new_tokens: 128
+saves_per_epoch: 1
+debug:
+# Default deepspeed config; switch to a more aggressive stage such as zero2 or zero3 if needed
+deepspeed: deepspeed_configs/zero1.json
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/mistral/Mistral-7b-example/data.jsonl
+++ b/examples/mistral/Mistral-7b-example/data.jsonl
@@ -0,0 +1,10 @@
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who is the Founder of Apple\""}, {"from": "gpt", "value": "\"<Chatbot>: The founder of Apple is Steve Jobs\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the capital of France?\""}, {"from": "gpt", "value": "\"<Chatbot>: The capital of France is Paris.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: How far is the Moon from Earth?\""}, {"from": "gpt", "value": "\"<Chatbot>: The Moon is approximately 384,400 kilometers from Earth.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the tallest mountain in the world?\""}, {"from": "gpt", "value": "\"<Chatbot>: The tallest mountain in the world is Mount Everest.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who wrote Romeo and Juliet?\""}, {"from": "gpt", "value": "\"<Chatbot>: Romeo and Juliet was written by William Shakespeare.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the boiling point of water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The boiling point of water is 100 degrees Celsius.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: When was the first man on the moon?\""}, {"from": "gpt", "value": "\"<Chatbot>: The first man landed on the moon in 1969.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the largest ocean?\""}, {"from": "gpt", "value": "\"<Chatbot>: The largest ocean is the Pacific Ocean.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who invented the telephone?\""}, {"from": "gpt", "value": "\"<Chatbot>: The telephone was invented by Alexander Graham Bell.\""}]}
+{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the formula for water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The chemical formula for water is H2O.\""}]}
--- a/examples/mistral/README.md
+++ b/examples/mistral/README.md
@@ -8,5 +8,5 @@ accelerate launch -m axolotl.cli.train examples/mistral/config.yml

 If you run into CUDA OOM, use deepspeed with config zero2.json:
 ```shell
-accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed/zero2.json
+accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed_configs/zero2.json
 ```
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -34,8 +34,8 @@ learning_rate: 0.000005

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -63,8 +63,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
@@ -84,7 +84,7 @@ eval_table_size:
 eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
-deepspeed: deepspeed/zero2.json
+deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 fsdp:
 fsdp_config:
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -50,8 +50,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -33,7 +33,7 @@ lr_scheduler: cosine
 learning_rate: 0.0000002
 train_on_inputs: false
 group_by_length: false
-bf16: true
+bf16: auto
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -52,6 +52,7 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true
 gptq_groupsize:
+s2_attention:
 gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
--- a/examples/phi/README.md
+++ b/examples/phi/README.md
@@ -3,7 +3,7 @@
 Due to some nuances with the phi code, please use deepspeed when training phi for full finetune.

 ```shell
-accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json
+accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed_configs/zero1.json

 # OR

--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-1_5
-model_type: PhiForCausalLM
+model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-is_llama_derived_model: false
-trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: false
@@ -18,7 +16,7 @@ output_dir: ./phi-sft-out

 sequence_len: 2048
 sample_packing: true
-pad_to_sequence_len:
+pad_to_sequence_len: true

 adapter:
 lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
 learning_rate: 0.000003

 train_on_inputs: false
-group_by_length: true
-bf16: true
-fp16: false
+group_by_length: false
+bf16: auto
+fp16:
 tf32: true

-gradient_checkpointing:
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention:
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
 fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
-  bos_token: "<|endoftext|>"
-  eos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-1_5
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-is_llama_derived_model: false
-trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: true
@@ -16,9 +14,9 @@ dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./phi-sft-out

-sequence_len: 1024
-sample_packing: false  # not CURRENTLY compatible with LoRAs
-pad_to_sequence_len:
+sequence_len: 2048
+sample_packing: true
+pad_to_sequence_len: true

 adapter: qlora
 lora_model_dir:
@@ -35,7 +33,7 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_torch
 adam_beta2: 0.95
@@ -45,18 +43,20 @@ lr_scheduler: cosine
 learning_rate: 0.000003

 train_on_inputs: false
-group_by_length: true
-bf16: true
-fp16: false
+group_by_length: false
+bf16: auto
+fp16:
 tf32: true

-gradient_checkpointing:
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention:
+flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 4
@@ -68,7 +68,4 @@ fsdp:
 fsdp_config:
 resize_token_embeddings_to_32x: true
 special_tokens:
-  bos_token: "<|endoftext|>"
-  eos_token: "<|endoftext|>"
-  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -1,8 +1,6 @@
 base_model: microsoft/phi-2
-model_revision:  834565c  # pin model repo to the previous architecture
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
-trust_remote_code: true

 load_in_8bit: false
 load_in_4bit: false
@@ -17,19 +15,16 @@ val_set_size: 0.05
 output_dir: ./phi-sft-out

 sequence_len: 2048
-sample_packing: false  # currently unsupported
-pad_to_sequence_len:
+sample_packing: true
+pad_to_sequence_len: true

 adapter:
 lora_model_dir:
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.1
-lora_target_linear: true
+lora_r:
+lora_alpha:
+lora_dropout:
+lora_target_linear:
 lora_fan_in_fan_out:
-lora_modules_to_save:
-  - embd
-  - lm_head

 wandb_project:
 wandb_entity:
@@ -38,22 +33,24 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 4
-optimizer: paged_adamw_8bit
+optimizer: adamw_torch
 adam_beta2: 0.95
 adam_epsilon: 0.00001
 max_grad_norm: 1.0
 lr_scheduler: cosine
-learning_rate: 1e-5
+learning_rate: 0.000003

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: true

 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/examples/pythia/lora.yml
+++ b/examples/pythia/lora.yml
@@ -27,7 +27,7 @@ num_epochs: 4
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
-bf16: true
+bf16: auto
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: false
--- a/examples/redpajama/config-3b.yml
+++ b/examples/redpajama/config-3b.yml
@@ -34,7 +34,7 @@ lr_scheduler: cosine
 learning_rate: 0.0000002
 train_on_inputs: false
 group_by_length: false
-bf16: true
+bf16: auto
 tf32: true
 early_stopping_patience:
 resume_from_checkpoint:
--- a/examples/replit-3b/config-lora.yml
+++ b/examples/replit-3b/config-lora.yml
@@ -33,7 +33,7 @@ lr_scheduler:
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
-bf16: true
+bf16: auto
 tf32: true
 gradient_checkpointing:
 early_stopping_patience:
--- a/examples/tiny-llama/lora-mps.yml
+++ b/examples/tiny-llama/lora-mps.yml
@@ -0,0 +1,65 @@
+base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+is_llama_derived_model: true
+
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: mhenrichsen/alpaca_2k_test
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0
+output_dir: ./lora-out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+eval_sample_packing: false
+
+adapter: lora
+lora_model_dir:
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_fan_in_fan_out:
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 4
+optimizer: adamw_torch
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16: false
+tf32: true
+
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention: false
+
+warmup_steps: 10
+evals_per_epoch: 0
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
--- a/examples/tiny-llama/lora.yml
+++ b/examples/tiny-llama/lora.yml
@@ -41,8 +41,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/tiny-llama/pretrain.yml
+++ b/examples/tiny-llama/pretrain.yml
@@ -12,6 +12,7 @@ max_steps: 200
 pretraining_dataset:
  path: c4
  name: en
+  type: pretrain
 dataset_prepared_path:
 val_set_size: 0.0
 output_dir: ./model-out
@@ -34,8 +35,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/tiny-llama/qlora.yml
+++ b/examples/tiny-llama/qlora.yml
@@ -43,8 +43,8 @@ learning_rate: 0.0002

 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false

 gradient_checkpointing: true
--- a/examples/xgen-7b/xgen-7b-8k-qlora.yml
+++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml
@@ -62,8 +62,8 @@ lr_scheduler: cosine
 learning_rate: 0.00002
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false
 gradient_checkpointing: true
 # stop training after this many evaluation losses have increased in a row
--- a/examples/yi-34B-chat/qlora.yml
+++ b/examples/yi-34B-chat/qlora.yml
@@ -7,8 +7,8 @@ load_in_8bit: false
 load_in_4bit: true
 strict: false
 sequence_len: 1024
-bf16: true
-fp16: false
+bf16: auto
+fp16:
 tf32: false
 flash_attention: true
 special_tokens:
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,3 +1,4 @@
 pre-commit
 black
 mypy
+types-requests
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,30 +1,29 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.7.0
-transformers @ git+https://github.com/huggingface/transformers.git@3cefac1d974db5e2825a0cb2b842883a628be7a0
+peft @ git+https://github.com/huggingface/peft.git
+transformers @ git+https://github.com/huggingface/transformers.git@bebeeee01275c32fccec3fa36d8b148d3813a7dc
 tokenizers==0.15.0
 bitsandbytes>=0.41.1
-accelerate @ git+https://github.com/huggingface/accelerate.git@0d2280dadc6a93413a5496613b7fdda3a4d2551b
-deepspeed
+accelerate==0.26.1
+deepspeed>=0.13.1
 addict
 fire
 PyYAML>=6.0
+requests
 datasets>=2.15.0
 flash-attn==2.3.3
 sentencepiece
 wandb
 einops
 xformers==0.0.22
-optimum==1.13.2
+optimum==1.16.2
 hf_transfer
 colorama
 numba
 numpy>=1.24.4
 mlflow
 # qlora things
-bert-score==0.3.13
 evaluate==0.4.0
-rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
 pynvml
--- a/scripts/cloud-entrypoint.sh
+++ b/scripts/cloud-entrypoint.sh
@@ -5,16 +5,24 @@ echo "Exporting environment variables..."
 printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
 echo 'source /etc/rp_environment' >> ~/.bashrc

-if [[ $PUBLIC_KEY ]]
-then
+if [[ $PUBLIC_KEY ]]; then
+    # runpod
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
    chmod 700 -R ~/.ssh
    # Start the SSH service in the background
    service ssh start
+elif [ -n "$SSH_KEY" ]; then
+    # latitude.sh
+    mkdir -p ~/.ssh
+    chmod 700 ~/.ssh
+    echo $SSH_KEY >> ~/.ssh/authorized_keys
+    chmod 700 -R ~/.ssh
+    # Start the SSH service in the background
+    service ssh start
 else
-    echo "No PUBLIC_KEY ENV variable provided, not starting openSSH daemon"
+    echo "No PUBLIC_KEY or SSH_KEY environment variable provided, not starting openSSH daemon"
 fi

 # Check if JUPYTER_PASSWORD is set and not empty
@@ -25,7 +33,7 @@ fi

 if [ "$JUPYTER_DISABLE" != "1" ]; then
    # Run Jupyter Lab in the background
-    jupyter lab --allow-root --ip 0.0.0.0 &
+    jupyter lab --port=8888 --ip=* --allow-root --ServerApp.allow_origin=* --ServerApp.preferred_dir=/workspace &
 fi

 # Execute the passed arguments (CMD)
--- a/scripts/motd
+++ b/scripts/motd
@@ -0,0 +1,17 @@
+
+                                 dP            dP   dP
+                                 88            88   88
+      .d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88
+      88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88
+      88.  .88  .d88b.  88.  .88 88 88.  .88   88   88
+      `88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP
+
+Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:
+
+```
+cd /workspace
+rm -rf /workspace/axolotl
+git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
+cd axolotl
+pip install --no-deps -e .
+```
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,7 @@
 """setup.py for axolotl"""

+import platform
+import re
 from importlib.metadata import PackageNotFoundError, version

 from setuptools import find_packages, setup
@@ -26,10 +28,25 @@ def parse_requirements():
                _install_requires.append(line)

    try:
-        torch_version = version("torch")
-        if torch_version.startswith("2.1.1"):
+        if "Darwin" in platform.system():
            _install_requires.pop(_install_requires.index("xformers==0.0.22"))
-            _install_requires.append("xformers==0.0.23")
+        else:
+            torch_version = version("torch")
+            _install_requires.append(f"torch=={torch_version}")
+
+            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
+            if version_match:
+                major, minor, patch = version_match.groups()
+                major, minor = int(major), int(minor)
+                patch = (
+                    int(patch) if patch is not None else 0
+                )  # Default patch to 0 if not present
+            else:
+                raise ValueError("Invalid version format")
+
+            if (major, minor) >= (2, 1):
+                _install_requires.pop(_install_requires.index("xformers==0.0.22"))
+                _install_requires.append("xformers>=0.0.23")
    except PackageNotFoundError:
        pass

@@ -41,7 +58,7 @@ install_requires, dependency_links = parse_requirements()

 setup(
    name="axolotl",
-    version="0.3.0",
+    version="0.4.0",
    description="LLM Trainer",
    long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
    package_dir={"": "src"},
@@ -50,13 +67,14 @@ setup(
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.3.3",
+            "flash-attn==2.5.0",
        ],
        "fused-dense-lib": [
            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
        ],
        "deepspeed": [
-            "deepspeed",
+            "deepspeed>=0.13.1",
+            "deepspeed-kernels",
        ],
        "mamba-ssm": [
            "mamba-ssm==1.0.1",
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -1,23 +1,26 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""

 import importlib
+import json
 import logging
 import math
 import os
 import random
 import sys
+import tempfile
 from pathlib import Path
 from threading import Thread
 from typing import Any, Dict, List, Optional, Union
+from urllib.parse import urlparse

 import gradio as gr
+import requests
 import torch
 import yaml

 # add src to the pythonpath so we don't need to pip install this
 from accelerate.commands.config import config_args
 from art import text2art
-from datasets import concatenate_datasets, load_dataset
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
@@ -30,7 +33,7 @@ from axolotl.utils.config import (
    normalize_config,
    validate_config,
 )
-from axolotl.utils.data import prepare_dataset
+from axolotl.utils.data import load_prepare_dpo_datasets, prepare_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.mlflow_ import setup_mlflow_env_vars
@@ -60,6 +63,52 @@ def print_axolotl_text_art(suffix=None):
        print(ascii_art)


+def check_remote_config(config: Union[str, Path]):
+    """Fetch a remote config: download an ``https://`` URL to a temp file and return its path.
+
+    Any other value is returned unchanged. The downloaded body must parse as YAML (JSON only
+    logs a warning). Raises RuntimeError if the download fails, ValueError if it is not YAML.
+    """
+    if not (isinstance(config, str) and config.startswith("https://")):
+        return config  # Return the original value if it's not an HTTPS URL
+
+    # A URL whose path ends in "/" has an empty basename; fall back to a generic name
+    filename = os.path.basename(urlparse(config).path) or "config.yml"
+
+    try:
+        response = requests.get(config, timeout=30)
+        response.raise_for_status()  # Check for HTTP errors
+    except requests.RequestException as err:
+        # This catches all requests-related exceptions including HTTPError
+        raise RuntimeError(f"Failed to download {config}: {err}") from err
+
+    content = response.content
+    try:
+        # Try parsing as JSON first to catch cases where JSON content is mistakenly considered YAML
+        json.loads(content)
+        # Log a warning but do not raise an error; JSON is technically valid YAML - this can happen when you forget to point to a raw github link
+        LOG.warning(
+            f"Warning: The content of the file at {config} is JSON, which is technically valid YAML but might not be intended."
+        )
+    except json.JSONDecodeError:
+        # If it's not valid JSON, verify it's valid YAML
+        try:
+            yaml.safe_load(content)
+        except yaml.YAMLError as err:
+            raise ValueError(
+                f"Failed to parse the content at {config} as YAML: {err}"
+            ) from err
+
+    # Create the temp dir only after validation so a failed download/parse leaves nothing behind
+    output_path = Path(tempfile.mkdtemp()) / filename
+    with open(output_path, "wb") as file:
+        file.write(content)
+    LOG.info(
+        f"Using the following config obtained from {config}:\n\n{content.decode('utf-8')}\n"
+    )
+    return output_path
+
+
 def get_multi_line_input() -> Optional[str]:
    print("Give me an instruction (Ctrl + D to submit): ")
    instruction = ""
@@ -79,7 +128,11 @@ def do_merge_lora(

    LOG.info("running merge of LoRA with base model")
    model = model.merge_and_unload(progressbar=True)
-    model.to(dtype=cfg.torch_dtype)
+    try:
+        model.to(dtype=cfg.torch_dtype)
+    except RuntimeError:
+        pass
+    model.generation_config.do_sample = True

    if cfg.local_rank == 0:
        LOG.info(f"saving merged model to: {str(Path(cfg.output_dir) / 'merged')}")
@@ -267,9 +320,10 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b
    return not any(el in list2 for el in list1)


-def load_cfg(config: Path = Path("examples/"), **kwargs):
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
+    config = check_remote_config(config)
    if Path(config).is_dir():
-        config = choose_config(config)
+        config = choose_config(Path(config))

    # load the config from the yaml file
    with open(config, encoding="utf-8") as file:
@@ -343,78 +397,7 @@ def load_rl_datasets(
    cfg: DictDefault,
    cli_args: TrainerCliArgs,  # pylint: disable=unused-argument
 ) -> TrainDatasetMeta:
-    train_datasets: List[Any] = []
-    for i, ds_cfg in enumerate(cfg.datasets):
-        train_datasets.insert(i, load_dataset(ds_cfg["path"], split=ds_cfg["split"]))
-    # eval_dataset = load_dataset(
-    #     cfg.test_datasets[0]["path"], split=cfg.test_datasets[0]["split"]
-    # )
-    eval_dataset = None
-
-    def argilla_apply_chatml(sample):  # pylint: disable=possibly-unused-variable
-        if "system" in sample and sample["system"]:
-            sample["prompt"] = (
-                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
-                f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
-            )
-        else:
-            sample[
-                "prompt"
-            ] = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen_response']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected_response']}<|im_end|>"
-        return sample
-
-    def intel_apply_chatml(sample):  # pylint: disable=possibly-unused-variable
-        if "system" in sample and sample["system"]:
-            sample["prompt"] = (
-                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
-                f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
-            )
-        else:
-            sample[
-                "prompt"
-            ] = f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
-        return sample
-
-    def apply_chatml(sample):  # pylint: disable=possibly-unused-variable
-        if "system" in sample and sample["system"]:
-            sample["prompt"] = (
-                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
-                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
-            )
-        else:
-            sample[
-                "prompt"
-            ] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
-        return sample
-
-    def ultra_apply_chatml(sample):  # pylint: disable=possibly-unused-variable
-        if "system" in sample and sample["system"]:
-            sample["prompt"] = (
-                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
-                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
-            )
-        else:
-            sample[
-                "prompt"
-            ] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
-        sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
-        sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
-        return sample
-
-    for i, data_set in enumerate(train_datasets):
-        _type = cfg.datasets[i]["type"]
-        ds_type_fn = locals()[_type]
-        train_datasets[i] = data_set.map(ds_type_fn)
-    train_dataset = concatenate_datasets(train_datasets)
-
-    # eval_dataset = eval_dataset.map(intel_apply_chatml)
-
+    train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
    total_num_steps = int(
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )
@@ -434,6 +417,13 @@ def check_accelerate_default_config():


 def check_user_token():
+    # Skip check if HF_HUB_OFFLINE is set to True
+    if os.getenv("HF_HUB_OFFLINE") == "1":
+        LOG.info(
+            "Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used."
+        )
+        return True
+
    # Verify if token is valid
    api = HfApi()
    try:
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -3,6 +3,7 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
+from typing import Union

 import fire
 import transformers
@@ -13,18 +14,21 @@ from axolotl.cli import (
    check_user_token,
    load_cfg,
    load_datasets,
+    load_rl_datasets,
    print_axolotl_text_art,
 )
 from axolotl.common.cli import PreprocessCliArgs
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
+from axolotl.prompt_strategies.sharegpt import register_chatml_template

 LOG = logging.getLogger("axolotl.cli.preprocess")


-def do_cli(config: Path = Path("examples/"), **kwargs):
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parsed_cfg = load_cfg(config, **kwargs)
+    parsed_cfg.is_preprocess = True
    check_accelerate_default_config()
    check_user_token()
    parser = transformers.HfArgumentParser((PreprocessCliArgs))
@@ -32,6 +36,14 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
        return_remaining_strings=True
    )

+    if parsed_cfg.chat_template == "chatml" and parsed_cfg.default_system_message:
+        LOG.info(
+            f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
+        )
+        register_chatml_template(parsed_cfg.default_system_message)
+    else:
+        register_chatml_template()
+
    if not parsed_cfg.dataset_prepared_path:
        msg = (
            Fore.RED
@@ -42,7 +54,11 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
        LOG.warning(msg)
        parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

-    _ = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+    if parsed_cfg.rl:
+        load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+    else:
+        load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+
    LOG.info(
        Fore.GREEN
        + f"Success! Preprocessed data path: `dataset_prepared_path: {parsed_cfg.dataset_prepared_path}`"
--- a/src/axolotl/cli/shard.py
+++ b/src/axolotl/cli/shard.py
@@ -3,6 +3,7 @@ CLI to shard a trained model into 10GiB chunks
 """
 import logging
 from pathlib import Path
+from typing import Union

 import fire
 import transformers
@@ -25,7 +26,7 @@ def shard(
    model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)


-def do_cli(config: Path = Path("examples/"), **kwargs):
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    parsed_cfg = load_cfg(config, **kwargs)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -3,9 +3,12 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
+from typing import Tuple, Union

 import fire
-import transformers
+from transformers.hf_argparser import HfArgumentParser
+from transformers.modeling_utils import PreTrainedModel
+from transformers.tokenization_utils import PreTrainedTokenizer

 from axolotl.cli import (
    check_accelerate_default_config,
@@ -16,27 +19,40 @@ from axolotl.cli import (
    print_axolotl_text_art,
 )
 from axolotl.common.cli import TrainerCliArgs
+from axolotl.prompt_strategies.sharegpt import register_chatml_template
 from axolotl.train import train

 LOG = logging.getLogger("axolotl.cli.train")


-def do_cli(config: Path = Path("examples/"), **kwargs):
+def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    # pylint: disable=duplicate-code
    parsed_cfg = load_cfg(config, **kwargs)
-    print_axolotl_text_art()
-    check_accelerate_default_config()
-    check_user_token()
-    parser = transformers.HfArgumentParser((TrainerCliArgs))
+    parser = HfArgumentParser((TrainerCliArgs))
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )
+    return do_train(parsed_cfg, parsed_cli_args)

-    if parsed_cfg.rl:
-        dataset_meta = load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
+
+def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+    print_axolotl_text_art()
+    check_accelerate_default_config()
+    check_user_token()
+    if cfg.chat_template == "chatml" and cfg.default_system_message:
+        LOG.info(
+            f"ChatML set. Adding default system message: {cfg.default_system_message}"
+        )
+        register_chatml_template(cfg.default_system_message)
    else:
-        dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
+        register_chatml_template()
+
+    if cfg.rl:
+        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
+    else:
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+    return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)


 if __name__ == "__main__":
--- a/src/axolotl/common/cli.py
+++ b/src/axolotl/common/cli.py
@@ -6,6 +6,7 @@ import logging
 from dataclasses import dataclass, field
 from typing import Optional

+import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -12,22 +12,29 @@ from abc import abstractmethod
 from dataclasses import dataclass, field
 from functools import wraps
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Type, Union

 import torch
 import transformers
 from datasets import Dataset
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
-from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
+from transformers import (
+    EarlyStoppingCallback,
+    Trainer,
+    TrainerCallback,
+    TrainingArguments,
+)
 from transformers.trainer_utils import seed_worker
 from trl import DPOTrainer

+from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GPUStatsCallback,
    LossWatchDogCallback,
+    SaveAxolotlConfigtoMlflowCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
@@ -37,11 +44,12 @@ from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    MambaDataCollator,
+    V2BatchSamplerDataCollatorForSeq2Seq,
 )
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.schedulers import (
    get_cosine_schedule_with_min_lr,
-    get_cosine_schedule_with_quadratic_warmup,
+    get_cosine_schedule_with_quadratic_warmup, JaggedLRRestartScheduler,
 )

 try:
@@ -52,6 +60,22 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")


+def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
+    """Merge `tag_names` into kwargs["tags"]; returns kwargs (None stays None)."""
+    # Copy first: callers pass class-level `tag_names`, which must never be
+    # aliased into kwargs or appended to, or tags leak across push_to_hub calls.
+    tag_names = [tag_names] if isinstance(tag_names, str) else list(tag_names)
+
+    if kwargs is not None:
+        if "tags" not in kwargs:
+            kwargs["tags"] = tag_names
+        elif isinstance(kwargs["tags"], list):
+            kwargs["tags"].extend(tag_names)
+        elif isinstance(kwargs["tags"], str):
+            kwargs["tags"] = tag_names + [kwargs["tags"]]
+    return kwargs
+
+
 @dataclass
 class AxolotlTrainingArguments(TrainingArguments):
    """
@@ -75,6 +99,10 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
+    multipack_real_batches: bool = field(
+        default=False,
+        metadata={"help": "Use real batches for efficient training."},
+    )
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
@@ -99,6 +127,22 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=None,
        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
    )
+    relora_anneal_steps: Optional[int] = field(
+        default=None,
+        metadata={"help": "how many anneal steps to take before reset for ReLoRA"},
+    )
+    jagged_restart_steps: Optional[int] = field(
+        default=None,
+        metadata={"help": "how often to reset for jagged restarts"},
+    )
+    jagged_restarts_warmup_steps: Optional[int] = field(
+        default=None,
+        metadata={"help": "how many warmup steps to take after reset for jagged restarts"},
+    )
+    jagged_restarts_anneal_steps: Optional[int] = field(
+        default=None,
+        metadata={"help": "how many anneal steps to take before reset for jagged restarts"},
+    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
@@ -163,24 +207,30 @@ class AxolotlTrainer(Trainer):
            num_training_steps (int): The number of training steps to do.
            optimizer (torch.optim.Optimizer): The training optimizer
        """
+        use_cosine_quadratic = (
+            self.args.lr_scheduler_type == "cosine"
+            and self.args.lr_quadratic_warmup is True
+        )
+
+        use_cosine_min_lr = (
+            self.args.lr_scheduler_type == "cosine"
+            and self.args.cosine_min_lr_ratio is not None
+        )

        # fmt: off
        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
            # fmt: on
-            if (
-                self.args.lr_scheduler_type == "cosine"
-                and self.args.lr_quadratic_warmup is True
-            ):
+            if use_cosine_quadratic:
+                if use_cosine_min_lr:
+                    LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
+
                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
-            elif self.args.lr_scheduler_type == "cosine" and self.args.cosine_min_lr_ratio is not None:
+            elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-                if self.args.deepspeed:
-                    LOG.warning("Using cosine scheduler with deepspeed. This may be ignored if a scheduler is set \
-                                in the deepspeed JSON")
                self.lr_scheduler = get_cosine_schedule_with_min_lr(  # pylint: disable=attribute-defined-outside-init
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
@@ -188,16 +238,46 @@ class AxolotlTrainer(Trainer):
                    min_lr_ratio=self.args.cosine_min_lr_ratio,
                )
            else:
-                return super().create_scheduler(num_training_steps, optimizer)
+                super().create_scheduler(num_training_steps, optimizer)
+        else:
+            if use_cosine_quadratic:
+                LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
+
+            if use_cosine_min_lr:
+                LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
+
+        if self.args.jagged_restart_steps:
+            warmup_steps = (
+                self.args.jagged_restarts_warmup_steps or 10
+            )
+            anneal_steps = (
+                self.args.jagged_restarts_anneal_steps or 1
+            )
+            self.lr_scheduler = JaggedLRRestartScheduler(
+                optimizer,
+                self.lr_scheduler,
+                self.args.jagged_restart_steps,
+                warmup_steps,
+                anneal_steps,
+            )
+
        return self.lr_scheduler

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if self.args.sample_packing and not self.args.pretraining:
+            if self.args.multipack_real_batches:
+                batch_size = self.args.per_device_train_batch_size
+                batch_max_len = self.args.max_seq_length
+            else:
+                batch_size = 1
+                batch_max_len = (
+                    self.args.per_device_train_batch_size * self.args.max_seq_length
+                )
            return MultipackBatchSampler(
                RandomSampler(self.train_dataset),
-                self.args.train_batch_size,
+                batch_size=batch_size,
                drop_last=True,
-                batch_max_len=self._train_batch_size * self.args.max_seq_length,
+                batch_max_len=batch_max_len,
                lengths=get_dataset_lengths(self.train_dataset),
                packing_efficiency_estimate=self.args.sample_packing_efficiency,
            )
@@ -207,11 +287,19 @@ class AxolotlTrainer(Trainer):
        self, eval_dataset: Dataset
    ) -> Optional[torch.utils.data.Sampler]:
        if self.args.sample_packing and self.args.eval_sample_packing is not False:
+            if self.args.multipack_real_batches:
+                batch_size = self.args.per_device_eval_batch_size
+                batch_max_len = self.args.max_seq_length
+            else:
+                batch_size = 1
+                batch_max_len = (
+                    self.args.per_device_eval_batch_size * self.args.max_seq_length
+                )
            return MultipackBatchSampler(
                SequentialSampler(eval_dataset),
-                self.args.per_device_eval_batch_size,
+                batch_size=batch_size,
                drop_last=True,
-                batch_max_len=self.args.eval_batch_size * self.args.max_seq_length,
+                batch_max_len=batch_max_len,
                lengths=get_dataset_lengths(eval_dataset),
                packing_efficiency_estimate=self.args.sample_packing_efficiency,
            )
@@ -220,7 +308,8 @@ class AxolotlTrainer(Trainer):
    def get_train_dataloader(self) -> DataLoader:
        if self.args.sample_packing and not self.args.pretraining:
            train_dataset = self.train_dataset
-            train_dataset = train_dataset.remove_columns(["length"])
+            if "length" in train_dataset.features.keys():
+                train_dataset = train_dataset.remove_columns(["length"])
            data_collator = self.data_collator
            dataloader_params = {
                "batch_size": self._train_batch_size,
@@ -328,30 +417,13 @@ class AxolotlTrainer(Trainer):
        #     return (loss, outputs) if return_outputs else loss
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

-    def _sanitize_kwargs_for_tagging(self, tag_names, kwargs=None):
-        if isinstance(tag_names, str):
-            tag_names = [tag_names]
-
-        if kwargs is not None:
-            if "tags" not in kwargs:
-                kwargs["tags"] = tag_names
-            elif "tags" in kwargs and isinstance(kwargs["tags"], list):
-                kwargs["tags"].extend(tag_names)
-            elif "tags" in kwargs and isinstance(kwargs["tags"], str):
-                tag_names.append(kwargs["tags"])
-                kwargs["tags"] = tag_names
-
-        return kwargs
-
    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
-        kwargs = self._sanitize_kwargs_for_tagging(
-            tag_names=self.tag_names, kwargs=kwargs
-        )
+        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)

        return super().push_to_hub(*args, **kwargs)

@@ -438,10 +510,14 @@ class ReLoRATrainer(AxolotlTrainer):
            warmup_steps = (
                self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10
            )
+            anneal_steps = (
+                self.args.relora_anneal_steps if self.args.relora_anneal_steps else 1
+            )
            self.lr_scheduler = ReLoRAScheduler(
                optimizer,
                lr_scheduler,
                self.args.relora_steps,
+                anneal_steps,
                warmup_steps,
            )
        else:
@@ -450,6 +526,24 @@ class ReLoRATrainer(AxolotlTrainer):
        return self.lr_scheduler


+class AxolotlDPOTrainer(DPOTrainer):
+    """
+    Extend the base DPOTrainer for axolotl helpers
+    """
+
+    tag_names = ["axolotl", "dpo"]
+
+    @wraps(DPOTrainer.push_to_hub)
+    def push_to_hub(self, *args, **kwargs) -> str:
+        """
+        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
+        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
+        """
+        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)
+
+        return super().push_to_hub(*args, **kwargs)
+
+
 class TrainerBuilderBase(abc.ABC):
    """
    Base class for trainer builder
@@ -458,6 +552,7 @@ class TrainerBuilderBase(abc.ABC):
    _train_dataset = None
    _eval_dataset = None
    _model_ref = None
+    _peft_config = None

    def __init__(self, cfg, model, tokenizer):
        self.cfg = cfg
@@ -488,13 +583,26 @@ class TrainerBuilderBase(abc.ABC):
    def eval_dataset(self, dataset):
        self._eval_dataset = dataset

+    @property
+    def peft_config(self):
+        return self._peft_config
+
+    @peft_config.setter
+    def peft_config(self, peft_config):
+        self._peft_config = peft_config
+
    @abstractmethod
    def build(self, total_num_steps):
        pass

-    @abstractmethod
-    def get_callbacks(self):
-        pass
+    def get_callbacks(self) -> List[TrainerCallback]:
+        callbacks = []
+        if self.cfg.use_wandb:
+            callbacks.append(
+                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
+            )
+
+        return callbacks

    @abstractmethod
    def get_post_trainer_create_callbacks(self, trainer):
@@ -502,12 +610,6 @@ class TrainerBuilderBase(abc.ABC):
        Callbacks added after the trainer is created, usually b/c these need access to the trainer
        """

-
-class HFCausalTrainerBuilder(TrainerBuilderBase):
-    """
-    Build the HuggingFace training args/trainer for Causal models
-    """
-
    def hook_pre_create_training_args(self, training_arguments_kwargs):
        # TODO
        return training_arguments_kwargs
@@ -524,10 +626,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        # TODO
        return trainer

+
+class HFCausalTrainerBuilder(TrainerBuilderBase):
+    """
+    Build the HuggingFace training args/trainer for Causal models
+    """
+
    def get_callbacks(self):
-        callbacks = []
+        callbacks = super().get_callbacks()
        callbacks.append(GPUStatsCallback(self.cfg))
-        callbacks.append(EvalFirstStepCallback)
+        callbacks.append(EvalFirstStepCallback())

        if self.cfg.relora_steps:
            callbacks.append(ReLoRACallback(self.cfg))
@@ -536,12 +644,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             hasattr(self.model, "use_bettertransformer")
             and self.model.use_bettertransformer is True
         ):
-            callbacks.append(SaveBetterTransformerModelCallback)
+            callbacks.append(SaveBetterTransformerModelCallback())

-        if self.cfg.use_wandb:
-            callbacks.append(
-                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
-            )
+        # SaveAxolotlConfigtoWandBCallback is already contributed by
+        # TrainerBuilderBase.get_callbacks(); appending it again registers it twice.
+        if self.cfg.use_mlflow:
+            callbacks.append(
+                SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
+            )

         if self.cfg.loss_watchdog_threshold is not None:
             callbacks.append(LossWatchDogCallback(self.cfg))
@@ -612,7 +724,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            training_arguments_kwargs[
                "gradient_checkpointing"
            ] = self.cfg.gradient_checkpointing
-            if self.cfg.gradient_checkpointing_kwargs:
+            if self.cfg.gradient_checkpointing_kwargs is not None:
                training_arguments_kwargs[
                    "gradient_checkpointing_kwargs"
                ] = self.cfg.gradient_checkpointing_kwargs
@@ -679,7 +791,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True

-        if self.cfg.val_set_size == 0:
+        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
            # no eval set, so don't eval
            training_arguments_kwargs["evaluation_strategy"] = "no"
        elif self.cfg.eval_steps:
@@ -745,9 +857,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs[
            "per_device_train_batch_size"
        ] = self.cfg.micro_batch_size
-        training_arguments_kwargs[
-            "per_device_eval_batch_size"
-        ] = self.cfg.eval_batch_size
+        if self.cfg.eval_batch_size:
+            training_arguments_kwargs[
+                "per_device_eval_batch_size"
+            ] = self.cfg.eval_batch_size
        training_arguments_kwargs[
            "gradient_accumulation_steps"
        ] = self.cfg.gradient_accumulation_steps
@@ -765,6 +878,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.load_best_model_at_end is not False
                or self.cfg.early_stopping_patience
            )
+            and not self.cfg.test_datasets
            and self.cfg.val_set_size > 0
            and self.cfg.save_steps
            and self.cfg.eval_steps
@@ -786,6 +900,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["optim"] = (
            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
        )
+        if self.cfg.save_only_model:
+            training_arguments_kwargs["save_only_model"] = self.cfg.save_only_model
        training_arguments_kwargs["lr_scheduler_type"] = (
            self.cfg.lr_scheduler
            if self.cfg.lr_scheduler
@@ -802,6 +918,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["sample_packing"] = (
            self.cfg.sample_packing if self.cfg.sample_packing else False
        )
+        training_arguments_kwargs["multipack_real_batches"] = (
+            self.cfg.flash_attention is not True
+        )
        training_arguments_kwargs["eval_sample_packing"] = (
            self.cfg.sample_packing
            if self.cfg.eval_sample_packing is not False
@@ -812,6 +931,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        ] = self.cfg.micro_batch_size
        training_arguments_kwargs["relora_steps"] = self.cfg.relora_steps
        training_arguments_kwargs["relora_warmup_steps"] = self.cfg.relora_warmup_steps
+        training_arguments_kwargs["relora_anneal_steps"] = self.cfg.relora_anneal_steps
        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
@@ -896,14 +1016,27 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if is_eval and training_args.eval_sample_packing:
            use_batch_sampler_collator = True

+        collator: Type[
+            Union[
+                V2BatchSamplerDataCollatorForSeq2Seq,
+                BatchSamplerDataCollatorForSeq2Seq,
+                DataCollatorForSeq2Seq,
+            ]
+        ]
        if use_batch_sampler_collator:
-            return BatchSamplerDataCollatorForSeq2Seq(
-                self.tokenizer,
-                return_tensors="pt",
-                **kwargs,
-            )
+            if self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
+                collator = V2BatchSamplerDataCollatorForSeq2Seq
+            elif (
+                self.cfg.model_config_type in ["llama"]
+                and self.cfg.flash_attention is not True
+            ):
+                collator = V2BatchSamplerDataCollatorForSeq2Seq
+            else:
+                collator = BatchSamplerDataCollatorForSeq2Seq
+        else:
+            collator = DataCollatorForSeq2Seq

-        return DataCollatorForSeq2Seq(
+        return collator(
            self.tokenizer,
            return_tensors="pt",
            **kwargs,
@@ -916,7 +1049,7 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
    """

    def get_callbacks(self):
-        callbacks = []
+        callbacks = super().get_callbacks()
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
@@ -934,21 +1067,82 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
        ]:
            if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
                training_args_kwargs[arg] = getattr(self.cfg, arg)
+
+        if self.cfg.hub_model_id:
+            training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id
+            training_args_kwargs["push_to_hub"] = True
+            training_args_kwargs["hub_private_repo"] = True
+            training_args_kwargs["hub_always_push"] = True
+
+            if self.cfg.hub_strategy:
+                training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy
+
+        if self.cfg.save_safetensors is not None:
+            training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors
+
+        if self.eval_dataset:
+            training_args_kwargs["evaluation_strategy"] = "steps"
+            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
+        else:
+            training_args_kwargs["evaluation_strategy"] = "no"
+        if self.cfg.bf16 or self.cfg.bfloat16:
+            training_args_kwargs["bf16"] = True
+
+        training_args_kwargs["lr_scheduler_type"] = (
+            self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine"
+        )
+        training_args_kwargs["lr_scheduler_kwargs"] = (
+            self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
+        )
+        if self.cfg.remove_unused_columns is not None:
+            training_args_kwargs[
+                "remove_unused_columns"
+            ] = self.cfg.remove_unused_columns
+        else:
+            training_args_kwargs["remove_unused_columns"] = False
+
+        if self.cfg.dataloader_pin_memory is not None:
+            training_args_kwargs[
+                "dataloader_pin_memory"
+            ] = self.cfg.dataloader_pin_memory
+        if self.cfg.dataloader_num_workers is not None:
+            training_args_kwargs[
+                "dataloader_num_workers"
+            ] = self.cfg.dataloader_num_workers
+        if self.cfg.dataloader_prefetch_factor is not None:
+            training_args_kwargs[
+                "dataloader_prefetch_factor"
+            ] = self.cfg.dataloader_prefetch_factor
+        if self.cfg.gradient_checkpointing:
+            training_args_kwargs[
+                "gradient_checkpointing"
+            ] = self.cfg.gradient_checkpointing
+            if self.cfg.gradient_checkpointing_kwargs is not None:
+                training_args_kwargs[
+                    "gradient_checkpointing_kwargs"
+                ] = self.cfg.gradient_checkpointing_kwargs
+            else:
+                training_args_kwargs["gradient_checkpointing_kwargs"] = {
+                    "use_reentrant": False
+                }
+
+        # set save_strategy and save_steps
+        if self.cfg.save_steps:
+            training_args_kwargs["save_strategy"] = "steps"
+            training_args_kwargs["save_steps"] = self.cfg.save_steps
+        elif self.cfg.save_strategy:
+            training_args_kwargs["save_strategy"] = self.cfg.save_strategy
+        else:
+            # default to saving each epoch if not defined
+            training_args_kwargs["save_strategy"] = "epoch"
+
        training_args = TrainingArguments(
            per_device_train_batch_size=self.cfg.micro_batch_size,
-            max_steps=total_num_steps,
-            remove_unused_columns=False,
+            max_steps=self.cfg.max_steps or total_num_steps,
            gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
            learning_rate=self.cfg.learning_rate,
-            evaluation_strategy="no",
-            # eval_steps=self.cfg.eval_steps,
-            save_strategy="steps",
-            save_steps=self.cfg.save_steps,
            output_dir=self.cfg.output_dir,
            warmup_steps=self.cfg.warmup_steps,
-            bf16=True,
-            gradient_checkpointing=self.cfg.gradient_checkpointing,
-            gradient_checkpointing_kwargs={"use_reentrant": False},
            logging_first_step=True,
            logging_steps=1,
            optim=self.cfg.optimizer,
@@ -967,22 +1161,31 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
                dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
        elif self.cfg.rl == "kto_pair":
            dpo_trainer_kwargs["loss_type"] = "kto_pair"
-
-        dpo_trainer = DPOTrainer(
+        if self.eval_dataset:
+            dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
+        if self.cfg.adapter and self.peft_config:
+            dpo_trainer_kwargs["peft_config"] = self.peft_config
+        if self.cfg.precompute_ref_log_probs is not None:
+            dpo_trainer_kwargs[
+                "precompute_ref_log_probs"
+            ] = self.cfg.precompute_ref_log_probs
+        dpo_trainer = AxolotlDPOTrainer(
            self.model,
            self.model_ref,
            args=training_args,
            beta=self.cfg.dpo_beta or 0.1,
            train_dataset=self.train_dataset,
-            # eval_dataset=self.eval_dataset,
-            eval_dataset=None,
            tokenizer=self.tokenizer,
            max_length=self.cfg.sequence_len,
            max_target_length=None,
            max_prompt_length=self.cfg.sequence_len,
            generate_during_eval=True,
+            callbacks=self.get_callbacks(),
            **dpo_trainer_kwargs,
        )
+        dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
+        for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
+            dpo_trainer.add_callback(callback)

        return dpo_trainer

--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -24,26 +24,30 @@ class TokenizedPromptDataset(Dataset):
        Args:
            prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
+            process_count (int): Number of processes to use for tokenizing.
+            keep_in_memory (bool): Whether to keep the tokenized dataset in memory.
    """

    def __init__(  # pylint: disable=super-init-not-called
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
-        dataset: IterableDataset,
+        dataset: Dataset,
        process_count: Optional[int] = None,
+        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        self.prompt_tokenizer = prompt_tokenizer
        self.process_count = process_count
-        super().__init__(self.process(dataset).data, **kwargs)
+        self.keep_in_memory = keep_in_memory
+        super().__init__(
+            self.process(dataset).data,
+            **kwargs,
+        )

    def process(self, dataset):
        features = dataset.features.keys()
-        num_proc = (
-            min(64, self.process_count)
-            if self.process_count
-            else min(64, os.cpu_count())
-        )
+        num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
+
        map_kwargs = {}
        if self.prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
@@ -52,6 +56,8 @@ class TokenizedPromptDataset(Dataset):
            self.prompt_tokenizer.tokenize_prompt,
            num_proc=num_proc,
            remove_columns=features,
+            keep_in_memory=self.keep_in_memory,
+            desc="Tokenizing Prompts",
            **map_kwargs,
        )

--- a/src/axolotl/models/phi/init.py
+++ b/src/axolotl/models/phi/init.py
@@ -1,8 +0,0 @@
-"""
-MixFormers model architecture used for phi models
-"""
-
-from .configuration_mixformer_sequential import MixFormerSequentialConfig  # noqa
-from .configuration_phi import PhiConfig  # noqa
-from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM  # noqa
-from .modeling_phi import PhiForCausalLM  # noqa
--- a/src/axolotl/models/phi/configuration_mixformer_sequential.py
+++ b/src/axolotl/models/phi/configuration_mixformer_sequential.py
@@ -1,63 +0,0 @@
-# pylint: skip-file
-
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import math
-from typing import Any, Dict, List, Optional, Union
-
-from transformers import PretrainedConfig
-
-
-class MixFormerSequentialConfig(PretrainedConfig):
-    """MixFormer (sequential for DeepSpeed) configuration."""
-
-    model_type = "mixformer-sequential"
-
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-        "input_emb_layer": "embd_layer",  # `input_emb_layer` key is for backward compatibility
-        "blocks": "architecture",  # `blocks` key is for backward compatibility
-    }
-
-    def __init__(
-        self,
-        vocab_size: Optional[int] = 50304,
-        n_positions: Optional[int] = 2048,
-        n_embd: Optional[int] = 1024,
-        n_layer: Optional[int] = 20,
-        n_inner: Optional[int] = None,
-        n_head: Optional[int] = 16,
-        rotary_dim: Optional[int] = 32,
-        activation_function: Optional[str] = "gelu_new",
-        embd_layer: Optional[str] = "default",
-        architecture: Union[Dict[str, Any], List[Dict[str, Any]]] = None,
-        embd_pdrop: Optional[float] = 0.0,
-        resid_pdrop: Optional[float] = 0.0,
-        layer_norm_epsilon: Optional[float] = 1e-5,
-        initializer_range: Optional[float] = 0.02,
-        tie_word_embeddings: Optional[bool] = False,
-        pad_vocab_size_multiple: Optional[int] = 64,
-        **kwargs
-    ) -> None:
-        self.vocab_size = int(
-            math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-        )
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.embd_layer = embd_layer
-        self.architecture = architecture
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/src/axolotl/models/phi/configuration_phi.py
+++ b/src/axolotl/models/phi/configuration_phi.py
@@ -1,65 +0,0 @@
-# pylint: skip-file
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import math
-from typing import Optional
-
-from transformers import PretrainedConfig
-
-
-class PhiConfig(PretrainedConfig):
-    """Phi configuration."""
-
-    model_type = "phi"
-    attribute_map = {
-        "max_position_embeddings": "n_positions",
-        "hidden_size": "n_embd",
-        "num_attention_heads": "n_head",
-        "num_hidden_layers": "n_layer",
-    }
-
-    def __init__(
-        self,
-        vocab_size: int = 50304,
-        n_positions: int = 2048,
-        n_embd: int = 1024,
-        n_layer: int = 20,
-        n_inner: Optional[int] = None,
-        n_head: int = 16,
-        n_head_kv: Optional[int] = None,
-        rotary_dim: Optional[int] = 32,
-        activation_function: Optional[str] = "gelu_new",
-        flash_attn: bool = False,
-        flash_rotary: bool = False,
-        fused_dense: bool = False,
-        attn_pdrop: float = 0.0,
-        embd_pdrop: float = 0.0,
-        resid_pdrop: float = 0.0,
-        layer_norm_epsilon: float = 1e-5,
-        initializer_range: float = 0.02,
-        tie_word_embeddings: bool = False,
-        pad_vocab_size_multiple: int = 64,
-        **kwargs
-    ) -> None:
-        self.vocab_size = int(
-            math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
-        )
-        self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_inner = n_inner
-        self.n_head = n_head
-        self.n_head_kv = n_head_kv
-        self.rotary_dim = min(rotary_dim, n_embd // n_head)
-        self.activation_function = activation_function
-        self.flash_attn = flash_attn
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
-        self.attn_pdrop = attn_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.resid_pdrop = resid_pdrop
-        self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
-
-        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
--- a/src/axolotl/models/phi/modeling_mixformer_sequential.py
+++ b/src/axolotl/models/phi/modeling_mixformer_sequential.py
@@ -1,930 +0,0 @@
-# pylint: skip-file
-
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-# BSD 3-Clause License
-#
-# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-from __future__ import annotations
-
-import copy
-import inspect
-from dataclasses import dataclass, field
-from typing import Any, Dict, Optional, Tuple
-
-import torch
-import torch.nn as nn
-from einops import rearrange
-from flash_attn.flash_attn_interface import (
-    flash_attn_kvpacked_func,
-    flash_attn_qkvpacked_func,
-    flash_attn_varlen_qkvpacked_func,
-)
-from transformers import PretrainedConfig, PreTrainedModel
-from transformers.activations import ACT2FN
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
-from ...monkeypatch.utils import get_cu_seqlens_from_pos_ids
-from .configuration_mixformer_sequential import MixFormerSequentialConfig
-
-
-@dataclass
-class InferenceParams:
-    """Inference parameters that are passed to the main model in order
-    to efficienly calculate and store the context during inference.
-    Adapted from https://github.com/Dao-AILab/flash-attention."""
-
-    max_sequence_len: int
-    max_batch_size: int
-    sequence_len_offset: int = 0
-    batch_size_offset: int = 0
-    key_value_memory_dict: dict = field(default_factory=dict)
-    fused_ft_kernel: bool = False
-    lengths_per_sample: Optional[torch.Tensor] = None
-
-
-class Embedding(nn.Module):
-    """Token embedding with dropout."""
-
-    def __init__(self, config: PretrainedConfig) -> None:
-        super().__init__()
-
-        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-
-    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_shape[-1])
-
-        hidden_states = self.wte(input_ids)
-        hidden_states = self.drop(hidden_states)
-
-        return hidden_states
-
-
-class RotaryEmbedding(nn.Module):
-    """PyTorch implementation of `flash-attn` RotaryEmbedding layer.
-    Adapted from https://github.com/Dao-AILab/flash-attention."""
-
-    def __init__(
-        self,
-        dim: int,
-        base: Optional[int] = 10000,
-        scale_base: Optional[float] = None,
-        device: Optional[str] = None,
-        **kwargs,
-    ) -> None:
-        super().__init__()
-
-        if scale_base is not None:
-            raise NotImplementedError
-
-        # Generate and save the inverse frequency buffer (non-trainable)
-        self.dim = dim
-        self.base = base
-        self.scale_base = scale_base
-        self.device = device
-
-        inv_freq = 1.0 / (
-            base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
-        )
-        self.register_buffer("inv_freq", inv_freq)
-
-        scale = (
-            (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
-            / (1.4 * dim)
-            if scale_base is not None
-            else None
-        )
-        self.register_buffer("scale", scale)
-
-        self._seq_len_cached = 0
-        self._cos_cached = None
-        self._sin_cached = None
-        self._cos_k_cached = None
-        self._sin_k_cached = None
-
-    def _update_cos_sin_cache(
-        self, x: torch.FloatTensor, seqlen_offset: Optional[int] = 0
-    ) -> None:
-        # Reset the tables if the sequence length has changed,
-        # or if we're on a new device (possibly due to tracing for instance)
-        seqlen = x.shape[1] + seqlen_offset
-
-        # Re-generate the inverse frequency buffer if it's not fp32
-        # (for instance if model.half() was called)
-        if self.inv_freq.dtype != "torch.float32":
-            self.inv_freq = 1.0 / (
-                self.base
-                ** (
-                    torch.arange(
-                        0, self.dim, 2, device=self.device, dtype=torch.float32
-                    )
-                    / self.dim
-                )
-            )
-
-        if (
-            seqlen > self._seq_len_cached
-            or self._cos_cached.device != x.device
-            or self._cos_cached.dtype != x.dtype
-        ):
-            self._seq_len_cached = seqlen
-            t = torch.arange(seqlen, device=x.device, dtype=torch.float32)
-
-            # Don't do einsum, it converts fp32 to fp16
-            # freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-            freqs = torch.outer(
-                t, self.inv_freq.to(device=t.device, dtype=torch.float32)
-            )
-            if self.scale is None:
-                self._cos_cached = torch.cos(freqs).to(x.dtype)
-                self._sin_cached = torch.sin(freqs).to(x.dtype)
-            else:
-                power = (
-                    torch.arange(
-                        seqlen, dtype=self.scale.dtype, device=self.scale.device
-                    )
-                    - seqlen // 2
-                ) / self.scale_base
-                scale = self.scale.to(device=power.device) ** rearrange(
-                    power, "s -> s 1"
-                )
-
-                # We want the multiplication by scale to happen in fp32
-                self._cos_cached = (torch.cos(freqs) * scale).to(x.dtype)
-                self._sin_cached = (torch.sin(freqs) * scale).to(x.dtype)
-                self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
-                self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
-
-    def apply_rotary_emb_qkv(
-        self,
-        qkv: torch.FloatTensor,
-        sin: torch.FloatTensor,
-        cos: torch.FloatTensor,
-        sin_k: Optional[torch.FloatTensor] = None,
-        cos_k: Optional[torch.FloatTensor] = None,
-    ) -> torch.FloatTensor:
-        _, seqlen, three, _, headdim = qkv.shape
-        assert three == 3
-
-        rotary_seqlen, rotary_dim = cos.shape
-        rotary_dim *= 2
-        assert rotary_dim <= headdim
-        assert seqlen <= rotary_seqlen
-
-        cos_k = cos if cos_k is None else cos_k
-        sin_k = sin if sin_k is None else sin_k
-        assert (
-            sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
-        )
-
-        q_rot = qkv[:, :, 0, :, :rotary_dim]
-        q_pass = qkv[:, :, 0, :, rotary_dim:]
-
-        k_rot = qkv[:, :, 1, :, :rotary_dim]
-        k_pass = qkv[:, :, 1, :, rotary_dim:]
-
-        # Splits the queries and keys in half
-        q1, q2 = q_rot.chunk(2, dim=-1)
-        k1, k2 = k_rot.chunk(2, dim=-1)
-        c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(
-            sin[:seqlen], "s d -> s 1 d"
-        )
-
-        # Casts to fp32 are necessary to prevent fp16 overflow issues
-        q1, q2, k1, k2, c, s = [
-            t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]
-        ]
-
-        # Computes the new keys and queries, recasting to original dtype
-        q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
-
-        k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
-
-        return torch.cat(
-            [
-                torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
-                torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
-                qkv[:, :, 2:3, :, :],
-            ],
-            axis=2,
-        )
-
-    def forward(
-        self, qkv: torch.Tensor, seqlen_offset: int = 0
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """Perform the forward pass.
-
-        Args:
-            qkv: Query, key and value tensors of shape (batch, seqlen, nheads, headdim) or (batch, seqlen, 3, nheads, headdim).
-            seqlen_offset: Used in generation where the passed `qkv` is only the last token in the batch.
-
-        Returns:
-            New `qkv` and the cached sinusoids.
-
-        """
-
-        self._update_cos_sin_cache(qkv, seqlen_offset)
-
-        return self.apply_rotary_emb_qkv(
-            qkv, self._sin_cached[seqlen_offset:], self._cos_cached[seqlen_offset:]
-        )
-
-
-def _update_kv_cache(kv, inference_params, layer_idx):
-    """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
-    Adapted from https://github.com/Dao-AILab/flash-attention."""
-    # Pre-allocate memory for key-values for inference.
-    num_heads, head_dim = kv.shape[-2:]
-    if layer_idx not in inference_params.key_value_memory_dict:
-        kv_cache = torch.empty(
-            inference_params.max_batch_size,
-            inference_params.max_sequence_len,
-            2,
-            num_heads,
-            head_dim,
-            dtype=kv.dtype,
-            device=kv.device,
-        )
-        inference_params.key_value_memory_dict[layer_idx] = kv_cache
-    else:
-        kv_cache = inference_params.key_value_memory_dict[layer_idx]
-
-    # Adjust key and value for inference
-    batch_start = inference_params.batch_size_offset
-    batch_end = batch_start + kv.shape[0]
-    sequence_start = inference_params.sequence_len_offset
-    sequence_end = sequence_start + kv.shape[1]
-    assert batch_end <= (
-        kv_cache.shape[0] if kv_cache is not None else v_cache.shape[0]  # noqa
-    )
-    assert sequence_end <= (
-        kv_cache.shape[1] if kv_cache is not None else v_cache.shape[2]  # noqa
-    )
-
-    assert kv_cache is not None
-    kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
-    kv = kv_cache[batch_start:batch_end, :sequence_end, ...]
-    return kv
-
-
-class MLP(nn.Module):
-    """Multi-Layer Perceptron.
-
-    Reference:
-        Attention Is All You Need.
-        https://arxiv.org/pdf/1706.03762.pdf.
-
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        n_inner: Optional[int] = None,
-        act_fn: Optional[str] = None,
-    ) -> None:
-        super().__init__()
-
-        act_fn = config.activation_function if act_fn is None else act_fn
-        assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
-
-        n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
-        n_inner = n_inner if n_inner is not None else 4 * config.n_embd
-
-        self.fc1 = nn.Linear(config.n_embd, n_inner)
-        self.fc2 = nn.Linear(n_inner, config.n_embd)
-        self.act = ACT2FN[act_fn]
-
-    def _load_from_state_dict(
-        self,
-        state_dict,
-        prefix,
-        local_metadata,
-        strict,
-        missing_keys,
-        unexpected_keys,
-        error_msgs,
-    ):
-        old_keys = [
-            prefix + "fc_in.weight",
-            prefix + "fc_out.weight",
-            prefix + "fc_in.bias",
-            prefix + "fc_out.bias",
-        ]
-        new_keys = [
-            prefix + "fc1.weight",
-            prefix + "fc2.weight",
-            prefix + "fc1.bias",
-            prefix + "fc2.bias",
-        ]
-
-        if all(k in state_dict for k in old_keys) and not all(
-            k in state_dict for k in new_keys
-        ):
-            # Older version of `MLP` saved with different key names.
-            for old_key, new_key in zip(old_keys, new_keys):
-                state_dict[new_key] = state_dict.pop(old_key)
-
-        return super()._load_from_state_dict(
-            state_dict,
-            prefix,
-            local_metadata,
-            strict,
-            missing_keys,
-            unexpected_keys,
-            error_msgs,
-        )
-
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-        hidden_states = self.fc1(hidden_states)
-        hidden_states = self.act(hidden_states)
-        hidden_states = self.fc2(hidden_states)
-
-        return hidden_states
-
-
-class FusedMLP(nn.Module):
-    """Fused Multi-Layer Perceptron from `flash-attn`.
-
-    Reference:
-        https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/ops/fused_dense.py.
-
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        n_inner: Optional[int] = None,
-        act_fn: Optional[str] = None,
-        raise_on_missing: bool = False,
-    ) -> None:
-        super().__init__()
-
-        act_fn = config.activation_function if act_fn is None else act_fn
-        assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
-
-        n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
-        n_inner = n_inner if n_inner is not None else 4 * config.n_embd
-
-        gelu_activations = ["gelu_new", "gelu_fast", "gelu_approx"]  # noqa
-        activation = "gelu_approx" if act_fn in gelu_activations else "relu"  # noqa
-
-        self.mlp = MLP(config, n_inner=n_inner, act_fn=act_fn)
-
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-        return self.mlp(hidden_states)
-
-
-class SelfAttention(nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Adapted from https://github.com/Dao-AILab/flash-attention.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
-        super().__init__()
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.drop = nn.Dropout(attention_dropout)
-
-    def forward(
-        self, qkv, causal=None, key_padding_mask=None, cu_seqlens=None, max_seqlen=None
-    ):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
-            causal: if passed, will override self.causal
-            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
-                False means to mask out. (B, S)
-        """
-        causal = self.causal if causal is None else causal
-        if cu_seqlens is not None:
-            return flash_attn_varlen_qkvpacked_func(
-                qkv.squeeze(0),
-                cu_seqlens,
-                max_seqlen,
-                dropout_p=self.drop.p,
-                softmax_scale=self.softmax_scale,
-                causal=causal,
-            )
-        else:
-            return flash_attn_qkvpacked_func(
-                qkv,
-                dropout_p=self.drop.p,
-                softmax_scale=self.softmax_scale,
-                causal=causal,
-            )
-
-
-class CrossAttention(nn.Module):
-    """Implement the scaled dot product attention with softmax.
-    Adapted from https://github.com/Dao-AILab/flash-attention.
-    Arguments
-    ---------
-        softmax_scale: The temperature to use for the softmax attention.
-                      (default: 1/sqrt(d_keys) where d_keys is computed at
-                      runtime)
-        attention_dropout: The dropout rate to apply to the attention
-                           (default: 0.0)
-    """
-
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
-        super().__init__()
-        self.causal = causal
-        self.softmax_scale = softmax_scale
-        self.drop = nn.Dropout(attention_dropout)
-
-    def forward(self, q, kv, causal=None, key_padding_mask=None):
-        """Implements the multihead softmax attention.
-        Arguments
-        ---------
-            q: The tensor containing the query. (B, Sq, H, D)
-            kv: The tensor containing the key and value. (B, Sk, 2, H, D)
-            causal: if passed, will override self.causal
-            key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
-                False means to mask out. (B, Sk)
-        """
-        causal = self.causal if causal is None else causal
-        return flash_attn_kvpacked_func(
-            q,
-            kv,
-            dropout_p=self.drop.p,
-            softmax_scale=self.softmax_scale,
-            causal=causal,
-        )
-
-
-def find_mha_dims(
-    config: PretrainedConfig,
-    n_head: Optional[int] = None,
-    head_dim: Optional[int] = None,
-) -> Tuple[int, int]:
-    """Validate and return the number of heads and head dimension for multi-head attention.
-
-    Args:
-        config: Model configuration.
-        n_head: Number of heads.
-        head_dim: Head dimension.
-
-    Returns:
-        Number of heads and head dimension.
-
-    """
-
-    assert all(
-        hasattr(config, attr) for attr in ["n_embd", "n_head"]
-    ), "`config` must have `n_embd` and `n_head` attributes."
-
-    if head_dim is None:
-        assert (
-            config.n_embd % config.n_head == 0
-        ), f"Hidden size ({config.n_embd}) must be divisible by the number of heads ({config.n_head})."
-
-    if n_head is None and head_dim is None:
-        head_dim = config.n_embd // config.n_head
-        n_head = config.n_head
-    elif n_head is None or head_dim is None:
-        raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
-
-    return n_head, head_dim
-
-
-class MHA(nn.Module):
-    """Multi-head attention layer.
-    Adapted from https://github.com/Dao-AILab/flash-attention."""
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        rotary_dim: Optional[int] = None,
-        n_head: Optional[int] = None,
-        head_dim: Optional[int] = None,
-        bias: Optional[bool] = True,
-        dropout: Optional[float] = 0.0,
-        softmax_scale: Optional[float] = None,
-        causal: Optional[bool] = True,
-        layer_idx: Optional[int] = None,
-        rotary_emb_scale_base: Optional[float] = None,
-        return_residual: Optional[bool] = False,
-        checkpointing: Optional[bool] = False,
-        device: Optional[str] = None,
-        dtype: Optional[torch.dtype] = None,
-        fused_dense: Optional[bool] = True,
-        flash_attn: Optional[bool] = True,
-        cutlass_attn: Optional[bool] = False,
-        flash_rotary: Optional[bool] = True,
-        raise_on_missing: Optional[bool] = False,
-    ) -> None:
-        super().__init__()
-
-        factory_kwargs = {"device": device, "dtype": dtype}
-        n_head, head_dim = find_mha_dims(config, n_head, head_dim)
-
-        self.hidden_size = config.n_embd
-        self.n_head = n_head
-        self.head_dim = head_dim
-        self.op_size = n_head * head_dim
-
-        self.causal = causal
-        self.layer_idx = layer_idx
-        self.rotary_emb_dim = (
-            rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
-        )
-        self.fused_dense = fused_dense
-        self.flash_attn = flash_attn
-        self.cutlass_attn = cutlass_attn
-        self.flash_rotary = flash_rotary
-        self.return_residual = return_residual
-        self.checkpointing = checkpointing
-
-        if self.rotary_emb_dim > 0:
-            rotary_kwargs = {"device": device}
-            if rotary_emb_scale_base is not None and rotary_emb_scale_base > 0.0:
-                rotary_kwargs["scale_base"] = rotary_emb_scale_base
-
-            self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, **rotary_kwargs)
-        else:
-            pass
-
-        self.Wqkv = nn.Linear(
-            self.hidden_size, 3 * self.op_size, bias=bias, **factory_kwargs
-        )
-        self.out_proj = nn.Linear(
-            self.op_size, self.hidden_size, bias=bias, **factory_kwargs
-        )
-
-        self.inner_attn = SelfAttention(
-            causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
-        )
-        self.inner_cross_attn = CrossAttention(
-            causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
-        )
-
-    def _update_kv_cache(
-        self, kv: torch.FloatTensor, inference_params: InferenceParams
-    ) -> None:
-        """kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
-        Adapted from https://github.com/Dao-AILab/flash-attention."""
-
-        assert (
-            self.layer_idx is not None
-        ), "Generation requires layer_idx in the constructor"
-
-        return _update_kv_cache(kv, inference_params, self.layer_idx)
-
-    def forward(
-        self,
-        x: torch.FloatTensor,
-        x_kv: Optional[torch.FloatTensor] = None,
-        key_padding_mask: Optional[torch.BoolTensor] = None,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        max_seqlen: Optional[int] = None,
-        mixer_subset: Optional[torch.LongTensor] = None,
-        past_cache: Optional[InferenceParams] = None,
-        **kwargs,
-    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
-        """Perform the forward pass.
-
-        Args:
-            x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
-                cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
-                is the is the sum of the sequence lengths in the batch.
-            x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
-            key_padding_mask: boolean mask, True means to keep, False means to mask out.
-                (batch, seqlen). Only applicable when not using FlashAttention.
-            cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
-                of the sequences in the batch, used to index into x. Only applicable when using
-                FlashAttention.
-            max_seqlen: int. Maximum sequence length in the batch.
-            mixer_subset: for cross-attention only. If not None, will take a subset of x
-                before applying the query projection. Useful for e.g., ViT where we only care
-                about the CLS token in the last layer.
-            past_cache: For generation only.
-
-        Returns:
-            (batch, seqlen, hidden_dim) if cu_seqlens is None and max_seqlen is None,
-                else (total, hidden_dim) where total is the is the sum of the sequence lengths
-                in the batch.
-
-        """
-
-        if cu_seqlens is not None:
-            assert max_seqlen is not None
-            assert key_padding_mask is None
-            assert self.flash_attn
-            # assert self.rotary_emb_dim == 0
-
-        if key_padding_mask is not None:
-            assert cu_seqlens is None
-            assert max_seqlen is None
-            assert not self.flash_attn
-
-        if past_cache is not None:
-            assert key_padding_mask is None
-            assert cu_seqlens is None and max_seqlen is None
-
-        attn_kwargs = {"key_padding_mask": key_padding_mask}
-
-        assert x_kv is None and mixer_subset is None
-
-        qkv = self.Wqkv(x)
-        qkv = rearrange(
-            qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim
-        )
-
-        if past_cache is None:
-            if self.rotary_emb_dim > 0:
-                qkv = self.rotary_emb(qkv)
-                context = self.inner_attn(
-                    qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, **attn_kwargs
-                )
-
-        else:
-            if self.rotary_emb_dim > 0:
-                qkv = self.rotary_emb(qkv, seqlen_offset=past_cache.sequence_len_offset)
-            q = qkv[:, :, 0]
-            kv = self._update_kv_cache(qkv[:, :, 1:], past_cache)
-            # If we're processing the prompt, causal=None (use self.causal).
-            # If we're decoding, then causal=False.
-            causal = None if past_cache.sequence_len_offset == 0 else False
-            context = self.inner_cross_attn(q, kv, causal=causal)
-
-        out = rearrange(context, "... h d -> ... (h d)")
-        out = self.out_proj(out)
-
-        return out if not self.return_residual else (out, x)
-
-
-class ParallelBlock(nn.Module):
-    """Parallel block.
-
-    This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
-
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        mixer: Optional[Dict[str, Any]] = None,
-        mlp: Optional[Dict[str, Any]] = None,
-        block_idx: Optional[int] = None,
-    ) -> None:
-        super().__init__()
-
-        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-        self.block_idx = block_idx
-
-        self.mixer = MHA(config, layer_idx=block_idx)
-        self.mlp = MLP(config)
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        past_cache: Optional[torch.FloatTensor] = None,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        max_seqlen: Optional[int] = None,
-    ) -> torch.FloatTensor:
-        residual = hidden_states
-        hidden_states = self.ln(hidden_states)
-
-        attn_outputs = self.mixer(
-            hidden_states,
-            past_cache=past_cache,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
-        )
-        if isinstance(attn_outputs, tuple):
-            attn_outputs = attn_outputs[0]
-
-        attn_outputs = self.resid_dropout(attn_outputs)
-        feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
-
-        hidden_states = attn_outputs + feed_forward_hidden_states + residual
-
-        return hidden_states
-
-
-class CausalLMHead(nn.Module):
-    """Causal Language Modeling head.
-
-    Reference:
-        Improving Language Understanding by Generative Pre-Training.
-        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
-
-    """
-
-    def __init__(self, config: PretrainedConfig) -> None:
-        super().__init__()
-
-        self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-        self.linear = nn.Linear(config.n_embd, config.vocab_size)
-
-    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-        hidden_states = self.ln(hidden_states)
-        logits = self.linear(hidden_states).to(torch.float32)
-
-        return logits
-
-
-class CausalLMLoss(nn.Module):
-    """Causal Language Modeling loss.
-
-    Reference:
-        Improving Language Understanding by Generative Pre-Training.
-        https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
-
-    """
-
-    def __init__(self, shift_labels: Optional[bool] = True) -> None:
-        super().__init__()
-
-        self.shift_labels = shift_labels
-        self.loss_fct = nn.CrossEntropyLoss()
-
-    def forward(
-        self, logits: torch.FloatTensor, labels: torch.LongTensor
-    ) -> torch.FloatTensor:
-        if self.shift_labels:
-            logits = logits[..., :-1, :].contiguous()
-            labels = labels[..., 1:].contiguous()
-
-        loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
-
-        return loss
-
-
-class MixFormerSequentialPreTrainedModel(PreTrainedModel):
-    """MixFormer (sequential for DeepSpeed) pre-trained model."""
-
-    config_class = MixFormerSequentialConfig
-    base_model_prefix = "transformer"
-    supports_gradient_checkpointing = True
-
-    def __init__(self, *inputs, **kwargs) -> None:
-        super().__init__(*inputs, **kwargs)
-
-    def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, **kwargs
-    ) -> Dict[str, Any]:
-        if "use_cache" in kwargs and not kwargs["use_cache"]:
-            return {"input_ids": input_ids}
-
-        if past_key_values is None or not (
-            isinstance(past_key_values, InferenceParams)
-        ):
-            past_key_values = InferenceParams(
-                max_batch_size=input_ids.shape[0],
-                max_sequence_len=self.config.n_positions,
-                sequence_len_offset=0,
-                batch_size_offset=0,
-                fused_ft_kernel=False,
-                key_value_memory_dict={},
-            )
-        else:
-            # assume past_key_values has cached all but last token in input_ids
-            past_key_values.sequence_len_offset = len(input_ids[0]) - 1
-            input_ids = input_ids[:, -1].unsqueeze(-1)
-
-        return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}
-
-
-class PackedSequential(nn.Sequential):
-    def forward(
-        self,
-        input,
-        cu_seqlens: Optional[torch.LongTensor] = None,
-        max_seqlen: Optional[int] = None,
-    ):
-        for module in self:
-            sig = inspect.signature(module.forward)
-            if "cu_seqlens" in sig.parameters:
-                input = module(input, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
-            else:
-                input = module(input)
-        return input
-
-
-class MixFormerSequentialForCausalLM(MixFormerSequentialPreTrainedModel):
-    """MixFormer (sequential for DeepSpeed) for Causal Language Modeling."""
-
-    _keys_to_ignore_on_load_missing = [""]
-    _keys_to_ignore_on_load_unexpected = [
-        r"layers\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"
-    ]
-    _no_split_modules = ["ParallelBlock"]
-
-    def __init__(self, config: MixFormerSequentialConfig) -> None:
-        super().__init__(config)
-
-        modules = [Embedding(config)]
-        block_config = config.architecture
-
-        if not isinstance(block_config, list):
-            block_config = [block_config for _ in range(config.n_layer)]
-
-        if config.n_layer != len(block_config):
-            config.n_layer = len(block_config)
-
-        for block_idx, block in enumerate(block_config):
-            # `block_cls` with `legacy` value is for backward compatibility
-            # `path` key is for backward compatibility
-            block = copy.deepcopy(block) or {"block_cls": "parallel"}
-            block.pop("path", None) or block.pop("block_cls", None)
-
-            block["block_idx"] = block_idx
-            modules.append(ParallelBlock(config, **block))
-
-        modules.append(CausalLMHead(config))
-
-        self.layers = PackedSequential(*modules)
-        self.loss = CausalLMLoss()
-
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Embedding:
-        return self.layers[0].wte
-
-    def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
-        self.layers[0].wte = new_embeddings
-
-    def get_output_embeddings(self) -> nn.Linear:
-        return self.layers[-1].linear
-
-    def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
-        self.layers[-1].linear = new_embeddings
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor,
-        labels: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[torch.FloatTensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        **kwargs,
-    ) -> CausalLMOutputWithPast:
-        cu_seqlens: Optional[torch.LongTensor] = None
-        max_seqlen: Optional[int] = None
-        if position_ids is not None:
-            batch_size, seq_length = input_ids.shape
-            position_ids = position_ids.view(-1, seq_length).long()
-            cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
-            cu_seqlens = cu_seqlens.squeeze()
-
-        if not past_key_values:
-            lm_logits = self.layers(
-                input_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
-            )
-        else:
-            hidden_layer = self.layers[0](input_ids)
-            for module in self.layers[1:-1]:
-                hidden_layer = module(
-                    hidden_layer,
-                    past_cache=past_key_values,
-                    cu_seqlens=cu_seqlens,
-                    max_seqlen=max_seqlen,
-                )
-            lm_logits = self.layers[-1](hidden_layer)
-
-        loss = None
-        if labels is not None:
-            loss = self.loss(lm_logits, labels)
-
-        return CausalLMOutputWithPast(
-            loss=loss, logits=lm_logits, past_key_values=past_key_values
-        )
--- a/src/axolotl/models/phi/modeling_phi.py
+++ b/src/axolotl/models/phi/modeling_phi.py
--- a/src/axolotl/monkeypatch/data/init.py
+++ b/src/axolotl/monkeypatch/data/init.py
--- a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
+++ b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
@@ -0,0 +1,46 @@
+"""monkey patches for the dataset fetcher to handle batches of packed indexes"""
+# pylint: disable=protected-access
+
+import torch
+from torch.utils.data._utils.fetch import _BaseDatasetFetcher
+from torch.utils.data._utils.worker import _worker_loop
+
+
+class _MapDatasetFetcher(_BaseDatasetFetcher):
+    """Map-style fetcher that also accepts a list of index lists (packed batches)."""
+
+    def _fetch_one(self, index):
+        # Look up a single (possibly batched) index, honouring `auto_collation`.
+        if not self.auto_collation:
+            return self.dataset[index]
+        if getattr(self.dataset, "__getitems__", None):
+            return self.dataset.__getitems__(index)
+        return [self.dataset[idx] for idx in index]
+
+    def fetch(self, possibly_batched_index):
+        # Guard the `[0]` probe: with batching disabled the index may be a bare,
+        # non-subscriptable value, and a batch may be empty.
+        nested = (
+            isinstance(possibly_batched_index, (list, tuple))
+            and len(possibly_batched_index) > 0
+            and isinstance(possibly_batched_index[0], list)
+        )
+        if nested:
+            data = [self._fetch_one(index) for index in possibly_batched_index]
+        else:
+            data = self._fetch_one(possibly_batched_index)
+        return self.collate_fn(data)
+
+
+def patch_fetchers():  # rebind `_MapDatasetFetcher` via both attribute paths
+    torch.utils.data._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher
+    torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher
+
+
+def patched_worker_loop(*args, **kwargs):  # wraps the imported `_worker_loop`
+    patch_fetchers()  # re-apply first; presumably so workers see the patch (confirm)
+    return _worker_loop(*args, **kwargs)
+
+
+torch.utils.data._utils.worker._worker_loop = patched_worker_loop
+patch_fetchers()
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -70,11 +70,20 @@ def replace_llama_attn_with_flash_attn(
    packed: Optional[bool] = False,
    cross_entropy: Optional[bool] = False,
    rms_norm: Optional[bool] = False,
+    use_shifted_sparse_attn: Optional[bool] = False,
 ):
    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
        _prepare_decoder_attention_mask
    )
-    transformers.models.llama.modeling_llama.LlamaAttention.forward = flashattn_forward
+    if use_shifted_sparse_attn:
+        transformers.models.llama.modeling_llama.LlamaAttention.forward = (
+            flashattn_forward_with_s2attn
+        )
+    else:
+        transformers.models.llama.modeling_llama.LlamaAttention.forward = (
+            flashattn_forward
+        )
+
    if packed:
        transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
        transformers.models.llama.modeling_llama.LlamaModel.forward = (
@@ -213,6 +222,136 @@ def _prepare_decoder_attention_mask(
    return attention_mask


+GROUP_SIZE_RATIO = 1 / 4
+
+
+def flashattn_forward_with_s2attn(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
+    cu_seqlens: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+    max_seqlen: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Shifted sparse attention (S2-Attn) forward for the patched `LlamaAttention`.
+
+    From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py
+
+    Input shape: Batch x Time x Channel. `attention_mask` is the 2D `[bsz, q_len]`
+    padding mask (None means no padding). `cu_seqlens`/`max_seqlen` are ignored.
+    Returns `(attn_output, None, past_key_value)`; raises ValueError when the group
+    size `int(q_len * GROUP_SIZE_RATIO)` is 0 or does not divide `q_len`.
+    """
+    if output_attentions:
+        warnings.warn(
+            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
+        )
+
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # [bsz, q_len, nh, hd]
+    # [bsz, nh, q_len, hd]
+    # pylint: disable=duplicate-code
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+
+    # Past Key value support
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = repeat_kv(key_states, self.num_key_value_groups)
+    value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+    # Flash attention codes from
+    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+
+    # transform the data into the format required by flash attention
+    qkv = torch.stack(
+        [query_states, key_states, value_states], dim=2
+    )  # [bsz, nh, 3, q_len, hd]
+    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+
+    # We have disabled _prepare_decoder_attention_mask in LlamaModel
+    # the attention_mask should be the same as the key_padding_mask
+    if attention_mask is None:  # no mask means no padding: every position is valid
+        attention_mask = hidden_states.new_ones((bsz, q_len), dtype=torch.bool)
+    key_padding_mask = attention_mask.repeat(2, 1)
+    nheads = qkv.shape[-2]
+    # shift; group_size is 0 for very short inputs, so reject that before taking `%`
+    group_size = int(q_len * GROUP_SIZE_RATIO)
+    if group_size == 0 or q_len % group_size > 0:
+        raise ValueError(
+            f"q_len {q_len} should be divisible by group size {group_size}."
+        )
+
+    qkv = (
+        qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim)
+        .permute(0, 3, 1, 2, 4, 5)
+        .reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim)
+    )
+    x = rearrange(  # pylint: disable=invalid-name
+        qkv, "b s three h d -> b s (three h d)"
+    )
+    x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+    cu_q_len_tmp = torch.arange(
+        0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype
+    )
+    cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp + group_size // 2]).repeat(
+        bsz, 1
+    ) + cu_q_lens[:-1].unsqueeze(-1)
+    cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)
+
+    x_unpad = rearrange(
+        x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
+    )
+    output_unpad = flash_attn_varlen_qkvpacked_func(
+        x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
+    )
+    output = rearrange(
+        pad_input(
+            rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
+        ),
+        "b s (h d) -> b s h d",
+        h=nheads // 2,
+    )
+    output = (
+        output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim)
+        .transpose(1, 2)
+        .reshape(bsz, q_len, nheads, self.head_dim)
+    )
+    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
+
+
 def flashattn_forward(
    self,
    hidden_states: torch.Tensor,
--- a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
@@ -1,142 +0,0 @@
-"""
-Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
-"""
-
-import warnings
-from typing import Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-import transformers.models.llama.modeling_llama
-from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
-
-
-def hijack_llama_sdp_attention():
-    transformers.models.llama.modeling_llama.LlamaAttention.forward = (
-        sdp_attention_forward
-    )
-
-
-def sdp_attention_forward(
-    self,
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    output_attentions: bool = False,
-    use_cache: bool = False,
-    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
-    **kwargs,  # pylint: disable=unused-argument
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-    # pylint: disable=duplicate-code
-    bsz, q_len, _ = hidden_states.size()
-
-    if not hasattr(self, "pretraining_tp"):
-        self.pretraining_tp = 1
-
-    if self.pretraining_tp > 1:
-        key_value_slicing = (
-            self.num_key_value_heads * self.head_dim
-        ) // self.pretraining_tp
-        query_slices = self.q_proj.weight.split(
-            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
-        )
-        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-        query_states = [
-            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        query_states = torch.cat(query_states, dim=-1)
-
-        key_states = [
-            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        key_states = torch.cat(key_states, dim=-1)
-
-        value_states = [
-            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        value_states = torch.cat(value_states, dim=-1)
-
-    else:
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-    query_states = query_states.view(
-        bsz, q_len, self.num_heads, self.head_dim
-    ).transpose(1, 2)
-    key_states = key_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    value_states = value_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    # [bsz, q_len, nh, hd]
-    # [bsz, nh, q_len, hd]
-
-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-    query_states, key_states = apply_rotary_pos_emb(
-        query_states, key_states, cos, sin, position_ids
-    )
-    # [bsz, nh, t, hd]
-
-    if past_key_value is not None:
-        # reuse k, v, self_attention
-        key_states = torch.cat([past_key_value[0], key_states], dim=2)
-        value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-    past_key_value = (key_states, value_states) if use_cache else None
-
-    # repeat k/v heads if n_kv_heads < n_heads
-    key_states = repeat_kv(key_states, self.num_key_value_groups)
-    value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-    if output_attentions:
-        warnings.warn(
-            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
-        )
-
-    #
-    # sdp-attn start
-    #
-
-    with torch.backends.cuda.sdp_kernel():
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=False,
-        )
-
-    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-        raise ValueError(
-            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-            f" {attn_output.size()}"
-        )
-    attn_output = attn_output.transpose(1, 2)
-    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-    #
-    # sdp-attn end
-    #
-
-    if self.pretraining_tp > 1:
-        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
-        o_proj_slices = self.o_proj.weight.split(
-            self.hidden_size // self.pretraining_tp, dim=1
-        )
-        attn_output = sum(
-            F.linear(attn_output[i], o_proj_slices[i])
-            for i in range(self.pretraining_tp)
-        )
-    else:
-        attn_output = self.o_proj(attn_output)
-
-    return attn_output, None, past_key_value
--- a/src/axolotl/monkeypatch/llama_expand_mask.py
+++ b/src/axolotl/monkeypatch/llama_expand_mask.py
@@ -5,38 +5,11 @@ from typing import Optional

 import torch

+from axolotl.monkeypatch.utils import mask_2d_to_4d
+

 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    This expansion handles packed sequences so that sequences share the same attention mask integer value
-    when they attend to each other within that sequence.
-    This expansion transforms the mask to lower triangular form to prevent future peeking.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    mask = mask.unsqueeze(1).unsqueeze(2)
-    mask = mask.expand(bsz, 1, tgt_len, src_len)
-
-    # Create a binary mask from the original mask where zeros remain zeros and all other values are set to one
-    binary_mask = torch.where(
-        mask != 0,
-        torch.tensor(1).to(dtype),
-        torch.tensor(0).to(dtype),
-    )
-
-    # Create a block-diagonal mask.
-    # we multiply by the binary mask so that 0's in the original mask are correctly excluded
-    zero_one_mask = torch.eq(mask, mask.transpose(-1, -2)).int() * binary_mask
-
-    # Now let's create a lower triangular mask of ones that will zero out the upper triangular part
-    lower_triangular_ones = torch.tril(torch.ones((tgt_len, src_len), dtype=dtype)).to(
-        mask.device
-    )
-
-    # Use the lower triangular mask to zero out the upper triangular part of the zero_one_mask
-    masked_zero_one_mask = zero_one_mask * lower_triangular_ones
+    masked_zero_one_mask = mask_2d_to_4d(mask, dtype, tgt_len)
    inverted_mask = 1.0 - masked_zero_one_mask

    return inverted_mask.masked_fill(
--- a/src/axolotl/monkeypatch/llama_patch_multipack.py
+++ b/src/axolotl/monkeypatch/llama_patch_multipack.py
@@ -0,0 +1,26 @@
+"""
+Patch the llama 4D causal attention mask helpers in transformers for multipack
+"""
+
+from axolotl.monkeypatch.utils import (
+    patched_prepare_4d_causal_attention_mask,
+    patched_prepare_4d_causal_attention_mask_for_sdpa,
+)
+
+
+def hijack_llama_prepare_4d_mask():
+    """Point transformers' 4D causal mask helpers at the patched axolotl versions."""
+    import transformers.modeling_attn_mask_utils
+    import transformers.models.llama.modeling_llama
+
+    targets = (
+        transformers.models.llama.modeling_llama,
+        transformers.modeling_attn_mask_utils,
+    )
+    for target in targets:
+        target._prepare_4d_causal_attention_mask_for_sdpa = (  # pylint: disable=protected-access
+            patched_prepare_4d_causal_attention_mask_for_sdpa
+        )
+        target._prepare_4d_causal_attention_mask = (  # pylint: disable=protected-access
+            patched_prepare_4d_causal_attention_mask
+        )
--- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
@@ -94,7 +94,7 @@ def _prepare_decoder_attention_mask(
    sliding_window,
 ):  # pylint: disable=unused-argument
    # [bsz, seq_len]
-    if attention_mask is None:
+    if attention_mask is None or sliding_window is None:
        return attention_mask

    # NOTE: attention mask and sliding masks are only broadcastable in certain scenarios.
@@ -151,7 +151,7 @@ def flashattn_forward(
    )

    use_sliding_windows = (
-        hasattr(self.config, "sliding_window") is not None
+        getattr(self.config, "sliding_window") is not None
        and kv_seq_len > self.config.sliding_window
    )

--- a/src/axolotl/monkeypatch/mixtral/init.py
+++ b/src/axolotl/monkeypatch/mixtral/init.py
@@ -1,22 +1,50 @@
 """
 Patches to support multipack for mixtral
 """
-import transformers
+import torch


-def replace_mixtral_attn_with_multipack_flash_attn():
-    from .modeling_mixtral import (
-        MixtralMultipackFlashAttention2,
-        mixtral_decoder_layer_forward,
-        mixtral_model_forward,
+def patch_mixtral_moe_forward_zero3() -> None:
+    import torch.nn.functional as F
+
+    def mlp_forward(self, hidden_states):
+        """Gated MLP: `w2(act_fn(w1(x)) * w3(x))`."""
+        gate_branch = self.act_fn(self.w1(hidden_states))
+        up_branch = self.w3(hidden_states)
+        # Elementwise gate, then project with w2.
+        return self.w2(gate_branch * up_branch)
+
+    # Ref. https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
+    def moe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """Top-k routed MoE forward; returns `(final_hidden_states, router_logits)`."""
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        tokens = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(tokens)
+
+        probs = F.softmax(router_logits, dim=1, dtype=torch.float)
+        topk_weight, topk_idx = torch.topk(probs, self.top_k, dim=-1, sorted=False)
+        # renormalise the selected weights, then cast back to the input dtype
+        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
+        topk_weight = topk_weight.to(tokens.dtype)
+
+        # one row per (token, selected expert) pair
+        expanded = tokens.repeat_interleave(self.top_k, dim=0)
+        expert_out = torch.empty_like(expanded)
+        flat_topk_idx = topk_idx.view(-1)
+        # every expert module is invoked, even when no rows are routed to it
+        for expert_id in range(self.num_experts):
+            routed = flat_topk_idx == expert_id
+            expert_out[routed] = self.experts[expert_id](expanded[routed])
+        weighted = expert_out.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)
+        combined = weighted.sum(dim=1)
+        final_hidden_states = combined.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+
+    from transformers.models.mixtral.modeling_mixtral import (
+        MixtralBLockSparseTop2MLP,
+        MixtralSparseMoeBlock,
    )

-    transformers.models.mixtral.modeling_mixtral.MixtralDecoderLayer.forward = (
-        mixtral_decoder_layer_forward
-    )
-    transformers.models.mixtral.modeling_mixtral.MixtralModel.forward = (
-        mixtral_model_forward
-    )
-    transformers.models.mixtral.modeling_mixtral.MIXTRAL_ATTENTION_CLASSES[
-        "flash_attention_2"
-    ] = MixtralMultipackFlashAttention2
+    MixtralBLockSparseTop2MLP.forward = mlp_forward
+    MixtralSparseMoeBlock.forward = moe_forward
--- a/src/axolotl/monkeypatch/mixtral/modeling_mixtral.py
+++ b/src/axolotl/monkeypatch/mixtral/modeling_mixtral.py
@@ -1,383 +0,0 @@
-"""
-Mixtral modeling for multipack
-"""
-# pylint: disable=missing-module-docstring,unused-argument,protected-access,pointless-string-statement,duplicate-code
-import logging
-import warnings
-from typing import List, Optional, Tuple, Union
-
-import torch
-from einops import rearrange
-from flash_attn import flash_attn_varlen_qkvpacked_func
-from transformers import Cache, DynamicCache
-from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
-from transformers.modeling_outputs import MoeModelOutputWithPast
-from transformers.models.mixtral.modeling_mixtral import (
-    MixtralFlashAttention2,
-    apply_rotary_pos_emb,
-    repeat_kv,
-)
-
-from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
-
-LOG = logging.getLogger("axolotl.monkeypatch.mixtral")
-
-
-class MixtralMultipackFlashAttention2(MixtralFlashAttention2):
-    """
-    Custom multipack implementation w flash attention 2
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self._flash_attn_uses_top_left_mask = True
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: bool = False,
-        use_cache: bool = False,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        if "padding_mask" in kwargs:
-            warnings.warn(
-                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-            )
-        bsz, q_len, _ = hidden_states.size()
-
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-        query_states = query_states.view(
-            bsz, q_len, self.num_heads, self.head_dim
-        ).transpose(1, 2)
-        key_states = key_states.view(
-            bsz, q_len, self.num_key_value_heads, self.head_dim
-        ).transpose(1, 2)
-        value_states = value_states.view(
-            bsz, q_len, self.num_key_value_heads, self.head_dim
-        ).transpose(1, 2)
-
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            if self.layer_idx is None:
-                raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
-                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
-                    "with a layer index."
-                )
-            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, key_states = apply_rotary_pos_emb(
-            query_states, key_states, cos, sin, position_ids
-        )
-
-        if past_key_value is not None:
-            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
-            key_states, value_states = past_key_value.update(
-                key_states, value_states, self.layer_idx, cache_kwargs
-            )
-
-        # repeat k/v heads if n_kv_heads < n_heads
-        key_states = repeat_kv(key_states, self.num_key_value_groups)
-        value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-        if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
-            # special handling using sample packing
-            qkv = torch.stack(
-                [query_states, key_states, value_states], dim=2
-            )  # [bsz, nh, 3, q_len, hd]
-            qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
-            qkv = rearrange(qkv, "b s ... -> (b s) ...")
-
-            attn_output = flash_attn_varlen_qkvpacked_func(
-                qkv,
-                cu_seqlens,
-                max_seqlen,
-                dropout_p=self.attention_dropout,
-                softmax_scale=None,
-                causal=True,
-            )
-            attn_output = rearrange(attn_output, "(b s) ... -> b s ...", b=bsz)
-
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
-def mixtral_decoder_layer_forward(
-    self,
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    output_attentions: Optional[bool] = False,
-    output_router_logits: Optional[bool] = False,
-    use_cache: Optional[bool] = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
-    max_seqlen: Optional[torch.Tensor] = None,
-    **kwargs,
-) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
-    if "padding_mask" in kwargs:
-        warnings.warn(
-            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-        )
-    """
-    Args:
-        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-        attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-            `(batch, sequence_length)` where padding elements are indicated by 0.
-        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-            returned tensors for more detail.
-        output_router_logits (`bool`, *optional*):
-            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
-            should not be returned during inference.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-            (see `past_key_values`).
-    """
-
-    residual = hidden_states
-
-    hidden_states = self.input_layernorm(hidden_states)
-
-    # Self Attention
-    hidden_states, self_attn_weights, present_key_value = self.self_attn(
-        hidden_states=hidden_states,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_value=past_key_value,
-        output_attentions=output_attentions,
-        use_cache=use_cache,
-        cu_seqlens=cu_seqlens,
-        max_seqlen=max_seqlen,
-    )
-    hidden_states = residual + hidden_states
-
-    # Fully Connected
-    residual = hidden_states
-    hidden_states = self.post_attention_layernorm(hidden_states)
-    hidden_states, router_logits = self.block_sparse_moe(hidden_states)
-    hidden_states = residual + hidden_states
-
-    outputs = (hidden_states,)
-
-    if output_attentions:
-        outputs += (self_attn_weights,)
-
-    if use_cache:
-        outputs += (present_key_value,)
-
-    if output_router_logits:
-        outputs += (router_logits,)
-
-    return outputs
-
-
-def mixtral_model_forward(
-    self,
-    input_ids: torch.LongTensor = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-) -> Union[Tuple, MoeModelOutputWithPast]:
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # retrieve input_ids and inputs_embeds
-    if input_ids is not None and inputs_embeds is not None:
-        raise ValueError(
-            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-        )
-    if input_ids is not None:
-        batch_size, seq_length = input_ids.shape
-    elif inputs_embeds is not None:
-        batch_size, seq_length, _ = inputs_embeds.shape
-    else:
-        raise ValueError(
-            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
-        )
-
-    past_key_values_length = 0
-
-    if use_cache:
-        use_legacy_cache = not isinstance(past_key_values, Cache)
-        if use_legacy_cache:
-            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
-        past_key_values_length = past_key_values.get_usable_length(seq_length)
-
-    cu_seqlens = None
-    max_seqlen = None
-    if position_ids is None:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        position_ids = torch.arange(
-            past_key_values_length,
-            seq_length + past_key_values_length,
-            dtype=torch.long,
-            device=device,
-        )
-        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-    else:
-        position_ids = position_ids.view(-1, seq_length).long()
-        cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
-        cu_seqlens = cu_seqlens.squeeze()
-
-    if inputs_embeds is None:
-        inputs_embeds = self.embed_tokens(input_ids)
-
-    if (
-        attention_mask is not None
-        and self._attn_implementation == "flash_attention_2"
-        and use_cache
-    ):
-        is_padding_right = attention_mask[:, -1].sum().item() != batch_size
-        if is_padding_right:
-            raise ValueError(
-                "You are attempting to perform batched generation with padding_side='right'"
-                " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
-                " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
-            )
-
-    if self._attn_implementation == "flash_attention_2":
-        # 2d mask is passed through the layers
-        attention_mask = (
-            attention_mask
-            if (attention_mask is not None and 0 in attention_mask)
-            else None
-        )
-    else:
-        # 4d mask is passed through the layers
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-            sliding_window=self.config.sliding_window,
-        )
-
-    hidden_states = inputs_embeds
-
-    if self.gradient_checkpointing and self.training:
-        if use_cache:
-            LOG.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
-            use_cache = False
-
-    # decoder layers
-    all_hidden_states = () if output_hidden_states else None
-    all_self_attns = () if output_attentions else None
-    all_router_logits = () if output_router_logits else None
-    next_decoder_cache = None
-
-    for decoder_layer in self.layers:
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        if self.gradient_checkpointing and self.training:
-            layer_outputs = self._gradient_checkpointing_func(
-                decoder_layer.__call__,
-                hidden_states,
-                attention_mask,
-                position_ids,
-                past_key_values,
-                output_attentions,
-                output_router_logits,
-                use_cache,
-                cu_seqlens,
-                max_seqlen,
-            )
-        else:
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                output_router_logits=output_router_logits,
-                use_cache=use_cache,
-                cu_seqlens=cu_seqlens,
-                max_seqlen=max_seqlen,
-            )
-
-        hidden_states = layer_outputs[0]
-
-        if use_cache:
-            next_decoder_cache = layer_outputs[2 if output_attentions else 1]
-
-        if output_attentions:
-            all_self_attns += (layer_outputs[1],)
-
-        if output_router_logits:
-            all_router_logits += (layer_outputs[-1],)
-
-    hidden_states = self.norm(hidden_states)
-
-    # add hidden states from the last decoder layer
-    if output_hidden_states:
-        all_hidden_states += (hidden_states,)
-
-    next_cache = None
-    if use_cache:
-        next_cache = (
-            next_decoder_cache.to_legacy_cache()
-            if use_legacy_cache
-            else next_decoder_cache
-        )
-
-    if not return_dict:
-        return tuple(
-            v
-            for v in [
-                hidden_states,
-                next_cache,
-                all_hidden_states,
-                all_self_attns,
-                all_router_logits,
-            ]
-            if v is not None
-        )
-
-    return MoeModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=next_cache,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attns,
-        router_logits=all_router_logits,
-    )
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -0,0 +1,30 @@
+"""multipack patching for v2 of sample packing"""
+
+import transformers
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
+from axolotl.monkeypatch.utils import get_unpad_data
+
+SUPPORTED_MULTIPACK_MODEL_TYPES = ["mixtral", "qwen2", "falcon", "phi"]
+
+
+def patch_for_multipack(model_type):
+    """
+    Replace the `_get_unpad_data` helper of the transformers modeling module
+    for `model_type` with axolotl's `get_unpad_data`.
+
+    Handles "mixtral", "qwen2", "falcon" and "phi"; any other value falls
+    through every branch and nothing is patched. For "mixtral" the MoE forward
+    is additionally patched when DeepSpeed ZeRO-3 is enabled.
+    """
+    if model_type == "mixtral":
+        transformers.models.mixtral.modeling_mixtral._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+        if is_deepspeed_zero3_enabled():
+            patch_mixtral_moe_forward_zero3()
+    elif model_type == "qwen2":
+        transformers.models.qwen2.modeling_qwen2._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "falcon":
+        transformers.models.falcon.modeling_falcon._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
+    elif model_type == "phi":
+        transformers.models.phi.modeling_phi._get_unpad_data = (  # pylint: disable=protected-access
+            get_unpad_data
+        )
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -4,14 +4,16 @@ import json
 import logging
 import os.path
 import shutil
+from functools import partial
 from pathlib import Path
-from typing import Dict, List, Sequence
+from typing import Dict, List, Sequence, Union

 import bitsandbytes as bnb
 import peft
 import safetensors.torch as st
 import torch
 from huggingface_hub import snapshot_download
+from torch.distributed.optim import ZeroRedundancyOptimizer
 from torch.optim.lr_scheduler import LRScheduler
 from torch.optim.optimizer import Optimizer
 from transformers import (
@@ -23,23 +25,50 @@ from transformers import (
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process
+from axolotl.utils.distributed import barrier, is_main_process

 LOG = logging.getLogger("axolotl.relora")


-def reset_optimizer(optimizer: torch.optim.Optimizer):
-    for group in optimizer.param_groups:
-        for param in group["params"]:
-            param_state = optimizer.state[param]
-            for key in param_state:
-                if "qmap" in key:
-                    continue
+@torch.no_grad()
+def magnitude_pruning_(tensor, prune_ratio):
+    """
+    Zero, in place, the entries of `tensor` whose magnitude does not exceed the
+    `prune_ratio` quantile of all magnitudes in the tensor.
+    """
+    tensor_magnitude = torch.abs(tensor)
+    # the quantile is taken over a flattened float32 copy, then cast back
+    threshold = torch.quantile(
+        tensor_magnitude.flatten().to(dtype=torch.float32), prune_ratio
+    ).to(dtype=tensor.dtype)

-                if key == "step" and isinstance(param_state[key], int):
-                    param_state[key] = 0
-                else:
-                    param_state[key] = torch.zeros_like(param_state[key])
+    # strict comparison: entries equal to the threshold are zeroed as well
+    mask = tensor_magnitude > threshold
+    tensor.mul_(mask.to(dtype=tensor.dtype))
+
+
+def reset_optimizer(
+    optimizer: torch.optim.Optimizer,
+    *,
+    reset_params: list[str],  # where str is the key to a torch.nn.Parameter
+    optimizer_state_keys: list[str],
+):
+    """
+    Partially reset optimizer state by magnitude-pruning it in place.
+
+    For every entry of `reset_params` that has optimizer state, each state
+    tensor named in `optimizer_state_keys` is pruned with a fixed ratio of 0.9.
+    The share and absolute count of zero-valued state entries are then logged.
+
+    Args:
+        optimizer: optimizer whose state is pruned; for a
+            `ZeroRedundancyOptimizer` the wrapped `optim.state` is used.
+        reset_params: keys used to index the optimizer state mapping.
+        optimizer_state_keys: names of the per-parameter state tensors to prune.
+
+    NOTE(review): torch optimizers normally key `state` by the parameter tensor
+    itself, while the call site visible in this diff builds `reset_params` from
+    parameter *names*. If the state mapping is a `defaultdict(dict)`, a name
+    lookup yields an empty dict and every parameter is skipped by the length
+    check below -- confirm the key type matches the optimizer's state keys.
+    """
+    pruning_fn = partial(magnitude_pruning_, prune_ratio=0.9)
+    n_zeros = 0
+    n_total = 0
+
+    optimizer_state = optimizer.state
+    if isinstance(optimizer, ZeroRedundancyOptimizer):
+        optimizer_state = optimizer.optim.state
+
+    for param in reset_params:
+        param_state = optimizer_state[param]
+        if len(param_state) == 0:  # no state for this param, happens for ZeRo optimizer
+            continue
+        for key in optimizer_state_keys:
+            pruning_fn(
+                param_state[key]
+            )  # pruning fn has to be inplace to keep the same keys in the dict
+            n_total += param_state[key].numel()
+            n_zeros += torch.sum(param_state[key] == 0).item()
+
+    # the small epsilon avoids a division by zero when nothing was visited
+    _zeroed = n_zeros / (1e-7 + n_total) * 100
+    LOG.info(f"Percent of optimizer states zeroed: {_zeroed:.2f}")
+    LOG.info(f"absolute n of optimizer states zeroed: {n_zeros}")


 class ReLoRACallback(TrainerCallback):
@@ -97,6 +126,25 @@ class ReLoRACallback(TrainerCallback):
                "relora",
            )

+            if "adam" in args.optim.lower():
+                optimizer_state_keys = ["exp_avg", "exp_avg_sq"]
+            else:
+                raise ValueError(f"Optimizer {args.optim} not supported with ReLoRA")
+
+            lora_params = [
+                n
+                for n, p in model.named_parameters()
+                if p.requires_grad and "lora_" in n
+            ]
+
+            model.save_pretrained(
+                os.path.join(
+                    args.output_dir,
+                    f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
+                    "adapter",
+                ),
+                safe_serialization=True,
+            )
            with torch.no_grad():
                merge_and_save(
                    model,
@@ -107,7 +155,11 @@ class ReLoRACallback(TrainerCallback):
                    actually_save=is_main_process(),
                    cpu_offload=self.cpu_offload,
                )
-                reset_optimizer(optimizer)
+                reset_optimizer(
+                    optimizer,
+                    reset_params=lora_params,
+                    optimizer_state_keys=optimizer_state_keys,
+                )

            if self.quantized:
                self.last_full_model = checkpoint_folder
@@ -197,11 +249,13 @@ class ReLoRAScheduler(LRScheduler):
        inner_schedule: LRScheduler,
        relora_steps: int,
        warmup_steps: int,
+        anneal_steps: int = 1,
        min_lr_scale: float = 0.001,
    ) -> None:
        self.inner_schedule = inner_schedule
        self.relora_steps = relora_steps
        self.warmup_steps = warmup_steps
+        self.anneal_steps = anneal_steps
        self.min_lr_scale = min_lr_scale
        super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose)

@@ -210,10 +264,20 @@ class ReLoRAScheduler(LRScheduler):

        original = self.inner_schedule.get_lr()
        step = self.last_epoch
+
        if step < self.relora_steps:
            scale = 1
        else:
-            cycle_t = min(1.0, (step % self.relora_steps) / self.warmup_steps)
+            per_relora_progress = step % self.relora_steps
+            if per_relora_progress < self.warmup_steps:
+                cycle_t = min(1.0, (per_relora_progress) / self.warmup_steps)
+            elif per_relora_progress > (self.relora_steps - self.anneal_steps):
+                cycle_t = min(
+                    1.0,
+                    (self.relora_steps - per_relora_progress) / self.anneal_steps,
+                )
+            else:
+                cycle_t = 1
            scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale

        if isinstance(original, Sequence):
@@ -238,7 +302,11 @@ def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:

 def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor:
    if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)):
-        adapter = layer.active_adapter
+        adapter: Union[List[str], str] = layer.active_adapter
+        if isinstance(adapter, list):
+            if len(adapter) > 1:
+                raise ValueError("unhandled relora for multiple adapters")
+            adapter = adapter[0]
        return (
            peft.utils.transpose(
                layer.lora_B[adapter].weight.detach().to(device)
@@ -248,7 +316,7 @@ def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor
            * layer.scaling[adapter]
        )

-    return layer.get_delta_weight().to(device)
+    raise ValueError("unhandled lora layer type")


 def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
@@ -273,9 +341,9 @@ def update_weights(
 ):
    if reinit:
        for adapter_name in target.lora_A:
-            target.reset_lora_parameters(adapter_name)
+            target.reset_lora_parameters(adapter_name, True)
        for adapter_name in target.lora_embedding_A:
-            target.reset_lora_parameters(adapter_name)
+            target.reset_lora_parameters(adapter_name, True)

    if isinstance(target, peft.tuners.lora.Linear4bit):
        # This could be faster, but the quantization of Linear4bit weights occurs
@@ -286,7 +354,9 @@ def update_weights(
        target.weight.data = new_weight.cpu()
        target.to(device)
    elif isinstance(target, peft.tuners.lora.Linear8bitLt):
-        target.weight = bnb.nn.Int8Params(new_weight, requires_grad=False).to(device)
+        target.weight.data = (
+            bnb.nn.Int8Params(new_weight, requires_grad=False).to(device).data
+        )
    else:
        target.weight.data = new_weight.to(device)

@@ -304,14 +374,17 @@ def merge_and_save(

    if not quantized:
        for module_name, target in modules.items():
-            update = target.get_delta_weight(target.active_adapter).detach()
+            active_adapter = target.active_adapter
+            if isinstance(active_adapter, list):
+                active_adapter = active_adapter[0]
+            update = target.get_delta_weight(active_adapter).detach()
            target.weight.data += update

            if reinit:
                for adapter_name in target.lora_A:
-                    target.reset_lora_parameters(adapter_name)
+                    target.reset_lora_parameters(adapter_name, True)
                for adapter_name in target.lora_embedding_A:
-                    target.reset_lora_parameters(adapter_name)
+                    target.reset_lora_parameters(adapter_name, True)
        return

    os.makedirs(model_dst, exist_ok=True)
@@ -363,6 +436,7 @@ def merge_and_save(
            LOG.info(f"saving tensors to {shard_fn}")
            st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})

+        barrier()
        del in_tensors
        del out_tensors
        torch.cuda.empty_cache()
--- a/src/axolotl/monkeypatch/utils.py
+++ b/src/axolotl/monkeypatch/utils.py
@@ -1,7 +1,48 @@
 """
 Shared utils for the monkeypatches
 """
+from typing import Optional
+
 import torch
+import torch.nn.functional as F
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
+from transformers.utils import is_torch_bf16_gpu_available
+
+
+@torch.jit.script
+def get_max_seqlen_in_batch(attention_mask: torch.Tensor) -> torch.Tensor:
+    # Despite the name, this returns one length per distinct positive value in
+    # each row of the 2-D `attention_mask`: for every row and every integer
+    # 1..max(attention_mask), the number of positions equal to that integer,
+    # flattened row by row with zero counts dropped.
+    # presumably each positive value identifies one packed sequence -- confirm
+    max_num = int(torch.max(attention_mask).item())
+    batch_size, _ = attention_mask.shape
+    # no device argument: counts live on the default device, not necessarily
+    # the device of attention_mask
+    counts = torch.zeros((batch_size, max_num), dtype=torch.int32)
+
+    for i in range(1, max_num + 1):
+        mask = attention_mask == i
+        counts[:, i - 1] = torch.sum(mask, dim=-1).to(dtype=torch.int32)
+
+    result = counts.flatten()
+    nonzero_indices = torch.nonzero(result).squeeze(-1)
+    return result[nonzero_indices]
+
+
+@torch.jit.script
+def get_unpad_data(attention_mask: torch.Tensor):
+    # Returns a 3-tuple derived from `attention_mask`:
+    #   indices             - positions of the non-zero entries of the flattened mask
+    #   cu_seqlens          - int32 cumulative sum of the per-sequence lengths with a
+    #                         leading 0, moved to the mask's device and detached
+    #   max_seqlen_in_batch - the largest per-sequence length as a Python number
+    device = attention_mask.device
+    seqlens_in_batch = get_max_seqlen_in_batch(attention_mask)
+    indices = torch.nonzero(attention_mask.flatten()).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = (
+        F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+        .to(device=device)
+        .detach()
+    )
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )


 def get_cu_seqlens(attn_mask):
@@ -55,7 +96,6 @@ def get_cu_seqlens(attn_mask):
    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)


-@torch.jit.script
 def get_cu_seqlens_from_pos_ids(position_ids):
    """generate a cumulative sequence length mask for flash attention using pos ids"""
    if len(position_ids.shape) == 1:
@@ -101,7 +141,18 @@ def get_cu_seqlens_from_pos_ids(position_ids):
        results.append(cu_seqlens)
        max_seq_lens.append(max_seq_len)

-    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)
+    # Find the maximum value across all tensors
+    max_value = max(t.max() for t in results)
+
+    # Find the length of the longest tensor
+    max_length = max(t.size(0) for t in results)
+
+    # Pad each tensor to the same length and collect them in a list
+    padded_results = [
+        F.pad(t, (0, max_length - t.size(0)), "constant", max_value) for t in results
+    ]
+
+    return torch.stack(padded_results).to(dtype=torch.int32), torch.stack(max_seq_lens)


 def set_module_name(model, name, value):
@@ -115,3 +166,62 @@ def set_module_name(model, name, value):
        child_name = name

    setattr(parent, child_name, value)
+
+
+def mask_2d_to_4d(
+    mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None
+):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    This expansion handles packed sequences so that sequences share the same attention mask integer value
+    when they attend to each other within that sequence.
+    This expansion transforms the mask to lower triangular form to prevent future peeking.
+
+    Args:
+        mask: 2-D tensor; 0 marks excluded positions, equal non-zero values mark
+            positions that may attend to each other.
+        dtype: dtype of the returned mask.
+        tgt_len: target length, defaults to the source length. The comparison
+            against the transposed mask only broadcasts when it equals the
+            source length.
+
+    Returns:
+        Tensor of shape `[bsz, 1, tgt_len, src_len]` holding 1 where attention
+        is allowed and 0 elsewhere.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    mask = mask.unsqueeze(1).unsqueeze(2)
+    mask = mask.expand(bsz, 1, tgt_len, src_len)
+
+    # 1 where the original mask is non-zero, 0 elsewhere
+    binary_mask = (mask != 0).to(dtype)
+
+    # Create a block-diagonal mask.
+    # we multiply by the binary mask so that 0's in the original mask are correctly excluded
+    zero_one_mask = torch.eq(mask, mask.transpose(-1, -2)).int() * binary_mask
+
+    # Lower triangular ones, allocated directly on the mask's device to avoid
+    # building it on the default device and copying it over on every call
+    lower_triangular_ones = torch.tril(
+        torch.ones((tgt_len, src_len), dtype=dtype, device=mask.device)
+    )
+
+    # Zero out the upper triangular part so no position can see the future
+    return zero_one_mask * lower_triangular_ones
+
+
+def patched_prepare_4d_causal_attention_mask(
+    attention_mask: Optional[torch.Tensor],
+    *args,
+):
+    """
+    Wrapper around transformers' `_prepare_4d_causal_attention_mask` that first
+    expands a 2-D mask with `mask_2d_to_4d`.
+
+    Args:
+        attention_mask: 2-D mask, or None.
+        *args: forwarded unchanged to `_prepare_4d_causal_attention_mask`.
+    """
+    if attention_mask is None:
+        # the signature allows None, but mask_2d_to_4d cannot expand it;
+        # hand it to the stock helper untouched instead of crashing
+        return _prepare_4d_causal_attention_mask(attention_mask, *args)
+
+    dtype = torch.bfloat16 if is_torch_bf16_gpu_available() else torch.float32
+    return _prepare_4d_causal_attention_mask(
+        mask_2d_to_4d(attention_mask, dtype=dtype),
+        *args,
+    )
+
+
+def patched_prepare_4d_causal_attention_mask_for_sdpa(
+    attention_mask: Optional[torch.Tensor],
+    *args,
+):
+    """
+    Wrapper around transformers' `_prepare_4d_causal_attention_mask_for_sdpa`
+    that first expands a 2-D mask with `mask_2d_to_4d`.
+
+    Args:
+        attention_mask: 2-D mask, or None.
+        *args: forwarded unchanged to `_prepare_4d_causal_attention_mask_for_sdpa`.
+    """
+    if attention_mask is None:
+        # the signature allows None, but mask_2d_to_4d cannot expand it;
+        # hand it to the stock helper untouched instead of crashing
+        return _prepare_4d_causal_attention_mask_for_sdpa(attention_mask, *args)
+
+    dtype = torch.bfloat16 if is_torch_bf16_gpu_available() else torch.float32
+    return _prepare_4d_causal_attention_mask_for_sdpa(
+        mask_2d_to_4d(attention_mask, dtype=dtype),
+        *args,
+    )
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -0,0 +1,67 @@
+from typing import Optional, Dict, Any
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+from axolotl.prompters import Prompter
+from axolotl.utils.chat_templates import chat_templates
+
+
+class ChatTemplatePrompter(Prompter):
+    """
+    Prompter that renders a conversation through the tokenizer's
+    `apply_chat_template`, optionally with an explicit chat template.
+    """
+
+    def __init__(self, tokenizer, chat_template=None, max_length=2048):
+        self.tokenizer = tokenizer
+        self.chat_template = chat_template
+        self.max_length = max_length
+
+    def build_prompt(self, conversation, add_generation_prompt=False):
+        # truncation is always requested, bounded by self.max_length
+        return self.tokenizer.apply_chat_template(
+            conversation, truncation=True, max_length=self.max_length,
+            add_generation_prompt=add_generation_prompt,
+            chat_template=self.chat_template,
+        )
+
+
+class ChatTemplateStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for instruction-based prompts.
+    """
+
+    def tokenize_prompt(self, prompt):
+        turns = self.get_conversation_thread(prompt)
+        # the first turn alone, rendered with a generation prompt, is used to
+        # measure how many leading tokens to mask out of the labels
+        # NOTE(review): assumes the first turn is the whole prompt; later
+        # non-assistant turns are not masked -- confirm this is intended
+        prompt_ids = self.prompter.build_prompt([turns[0]], add_generation_prompt=True)
+        input_ids = self.prompter.build_prompt(turns)
+
+        if not self.train_on_inputs:
+            user_prompt_len = len(prompt_ids)
+            # -100 on the prompt tokens; presumably the ignore index of the
+            # loss -- confirm
+            labels = [-100] * user_prompt_len + input_ids[user_prompt_len:]
+        else:
+            labels = input_ids
+
+
+        tokenized_prompt = {
+            "input_ids": input_ids,
+            "labels": labels,
+            "attention_mask": [1] * len(input_ids)
+        }
+
+        return tokenized_prompt
+
+    def get_conversation_thread(self, prompt):
+        # convert {"from", "value"} entries of prompt["conversations"] into
+        # {"role", "content"} turns
+        conversations = prompt["conversations"]
+        # remap roles - allow for assistant turn
+        # a "from" value missing from role_map (e.g. "system") raises KeyError
+        role_map = {"human": "user", "user": "user", "assistant": "assistant", "gpt": "assistant"}
+        turns = [
+            {"role": role_map[t["from"]], "content": t["value"]} for t in conversations
+        ]
+        return turns
+
+
+def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
+    """
+    Build the chat-template tokenizing strategy for a dataset.
+
+    Args:
+        tokenizer: tokenizer handed to both the prompter and the strategy.
+        cfg: config providing `train_on_inputs` and `sequence_len`.
+        ds_cfg: optional per-dataset config; its "conversation" entry names the
+            chat template to use.
+
+    Returns:
+        A configured `ChatTemplateStrategy`.
+    """
+    # ds_cfg defaults to None, so it must not be subscripted unconditionally.
+    # assumes "chatml" is a template name known to chat_templates -- TODO confirm
+    chat_template = (
+        ds_cfg["conversation"] if ds_cfg and "conversation" in ds_cfg else "chatml"
+    )
+    strategy = ChatTemplateStrategy(
+        ChatTemplatePrompter(
+            tokenizer,
+            chat_templates(chat_template),
+        ),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+    return strategy
--- a/src/axolotl/prompt_strategies/dpo/init.py
+++ b/src/axolotl/prompt_strategies/dpo/init.py
@@ -0,0 +1,21 @@
+"""
+module for DPO style dataset transform strategies
+"""
+
+import importlib
+import logging
+
+LOG = logging.getLogger("axolotl")
+
+
+def load(strategy, cfg):
+    """
+    Resolve a DPO dataset transform from a dotted strategy name.
+
+    The last dotted component is the factory function; everything before it
+    is the module path relative to `axolotl.prompt_strategies.dpo`.
+
+    :param strategy: dotted name, e.g. "<module>.<function>"
+    :param cfg: config object passed to the factory function
+    :return: the factory's return value, or None if anything fails
+    """
+    try:
+        # keep `strategy` untouched so the warning below reports the full name
+        module_name, _, load_fn = strategy.rpartition(".")
+        mod = importlib.import_module(
+            f".{module_name}", "axolotl.prompt_strategies.dpo"
+        )
+        func = getattr(mod, load_fn)
+        return func(cfg)
+    except Exception as exc:  # pylint: disable=broad-exception-caught
+        LOG.warning(f"unable to load strategy {strategy}: {exc}")
+        return None
--- a/src/axolotl/prompt_strategies/dpo/chatml.py
+++ b/src/axolotl/prompt_strategies/dpo/chatml.py
@@ -0,0 +1,110 @@
+"""
+DPO strategies for chatml
+"""
+
+
+def argilla(
+    cfg,
+):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transform for rows with instruction, chosen_response and
+    rejected_response (plus an optional system message)
+    """
+
+    def transform_fn(sample):
+        # the sample is updated in place and returned
+        segments = []
+        if "system" in sample and sample["system"]:
+            segments.append(f"<|im_start|>system\n{sample['system']}<|im_end|>\n")
+        segments.append(
+            f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        sample["prompt"] = "".join(segments)
+        for target, source in (
+            ("chosen", "chosen_response"),
+            ("rejected", "rejected_response"),
+        ):
+            sample[target] = f"{sample[source]}<|im_end|>"
+        return sample
+
+    return transform_fn
+
+
+def icr(
+    cfg,
+):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transform for rows with input, chosen and rejected (plus an
+    optional system message)
+    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
+    """
+
+    def transform_fn(sample):
+        # the sample is updated in place and returned
+        system_block = ""
+        if "system" in sample and sample["system"]:
+            system_block = f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
+        sample["prompt"] = (
+            system_block
+            + f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for key in ("chosen", "rejected"):
+            sample[key] = f"{sample[key]}<|im_end|>"
+        return sample
+
+    return transform_fn
+
+
+def intel(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transform for Intel Orca DPO Pairs rows (question, chosen,
+    rejected, optional system)
+    """
+
+    def transform_fn(sample):
+        # the sample is updated in place and returned
+        has_system = "system" in sample and sample["system"]
+        system_block = (
+            f"<|im_start|>system\n{sample['system']}<|im_end|>\n" if has_system else ""
+        )
+        user_block = (
+            f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        sample["prompt"] = system_block + user_block
+        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
+        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
+        return sample
+
+    return transform_fn
+
+
+def prompt_pairs(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transform for rows with prompt, chosen and rejected (plus an
+    optional system message); the raw prompt is replaced by the rendered one
+    """
+
+    def transform_fn(sample):
+        # the sample is updated in place and returned
+        pieces = []
+        if "system" in sample and sample["system"]:
+            pieces.append(f"<|im_start|>system\n{sample['system']}<|im_end|>\n")
+        pieces.append(
+            f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        sample["prompt"] = "".join(pieces)
+        for key in ("chosen", "rejected"):
+            sample[key] = f"{sample[key]}<|im_end|>"
+        return sample
+
+    return transform_fn
+
+
+def ultra(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    chatml transform for ultrafeedback binarized conversations; chosen and
+    rejected are message lists and the content of the message at index 1 is
+    used as the completion
+    """
+
+    def transform_fn(sample):
+        # the sample is updated in place and returned
+        system_block = ""
+        if "system" in sample and sample["system"]:
+            system_block = f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
+        sample["prompt"] = (
+            system_block
+            + f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for key in ("chosen", "rejected"):
+            sample[key] = f"{sample[key][1]['content']}<|im_end|>"
+        return sample
+
+    return transform_fn
--- a/src/axolotl/prompt_strategies/dpo/zephyr.py
+++ b/src/axolotl/prompt_strategies/dpo/zephyr.py
@@ -0,0 +1,21 @@
+"""
+DPO strategies for zephyr
+"""
+
+
+def nectar(cfg):  # pylint: disable=possibly-unused-variable,unused-argument
+    """
+    zephyr-format transform for rows with a prompt and a list of ranked
+    answers; returns a new dict with prompt, chosen and rejected
+    """
+
+    def transform_fn(sample):
+        rendered_prompt = (
+            "<|system|>\n</s>\n"
+            "<|user|>\n"
+            f"{sample['prompt']}</s>\n"
+            "<|assistant|>\n"
+        )
+        # NOTE(review): answers are sorted ascending by "rank" and the two
+        # largest ranks become chosen/rejected - confirm that a larger rank
+        # means a better answer in the source dataset.
+        by_rank = sorted(sample["answers"], key=lambda item: item["rank"])
+        return {
+            "prompt": rendered_prompt,
+            "chosen": by_rank[-1]["answer"],
+            "rejected": by_rank[-2]["answer"],
+        }
+
+    return transform_fn
--- a/src/axolotl/prompt_strategies/instruct.py
+++ b/src/axolotl/prompt_strategies/instruct.py
@@ -0,0 +1,33 @@
+"""Module containing the InstructShareGPTPromptTokenizingStrategy class"""
+from typing import Any, Dict, Optional
+
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import ShareGPTPrompterV2
+
+
+def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
+    """
+    Build the instruct-to-sharegpt tokenizing strategy.
+
+    :param tokenizer: tokenizer handed to the strategy
+    :param cfg: config providing `train_on_inputs` and `sequence_len`
+    :param ds_cfg: optional per-dataset config; its "conversation" entry, when
+        present, is passed to the prompter (otherwise None is passed)
+    """
+    conversation = None
+    if ds_cfg and "conversation" in ds_cfg:
+        conversation = ds_cfg["conversation"]
+
+    # pylint: disable=duplicate-code
+    prompter = ShareGPTPrompterV2(conversation=conversation)
+    return InstructShareGPTPromptTokenizingStrategy(
+        prompter,
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
+class InstructShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    sharegpt strategy that turns an instruction/output row into a two-turn
+    human/gpt conversation
+    """
+
+    def get_conversation_thread(self, prompt):
+        human_turn = {"from": "human", "value": prompt["instruction"]}
+        gpt_turn = {"from": "gpt", "value": prompt["output"]}
+        return [human_turn, gpt_turn]
--- a/src/axolotl/prompt_strategies/pretrain.py
+++ b/src/axolotl/prompt_strategies/pretrain.py
@@ -0,0 +1,58 @@
+"""pretraining prompt strategies"""
+from typing import Generator
+
+from transformers import BatchEncoding
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+
+
+class PretrainTokenizer:
+    """pass-through prompter for pretraining: text is used as-is"""
+
+    def build_prompt(self, prompt) -> Generator[str, None, None]:
+        # generator that produces the given prompt once, unchanged
+        yield from (prompt,)
+
+
+class PretrainTokenizationStrategy(PromptTokenizingStrategy):
+    """
+    tokenization for pretraining text: the tokenizer is asked for overflowing
+    chunks (stride 256) and EOS is appended to every returned sequence
+    """
+
+    def __init__(self, *args, max_length=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        # a falsy max_length leaves whatever the parent initializer set up
+        if max_length:
+            self.max_length = max_length
+
+    @property
+    def supports_batched(self):
+        return True
+
+    def _tokenize(
+        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
+    ) -> BatchEncoding:
+        # add_eos_token / strip_bos_token are accepted but not consulted here:
+        # EOS is always appended and nothing is stripped.
+        tokenizer_kwargs = {
+            "truncation": True,
+            # leave one slot free for the EOS token appended below
+            "max_length": self.max_length - 1,
+            "add_special_tokens": True,
+            "return_overflowing_tokens": True,
+            "stride": 256,
+        }
+        encoded = self.tokenizer(prompt, **tokenizer_kwargs)
+
+        ids_with_eos = []
+        for ids in encoded["input_ids"]:
+            ids_with_eos.append(ids + [self.tokenizer.eos_token_id])
+        encoded["input_ids"] = ids_with_eos
+
+        masks_with_eos = []
+        for mask in encoded["attention_mask"]:
+            masks_with_eos.append(mask + [1])
+        encoded["attention_mask"] = masks_with_eos
+
+        return encoded
+
+    def tokenize_prompt(self, prompt):
+        # tokenizes the row's "text" field
+        text = prompt["text"]
+        return self._tokenize(text)
+
+
+def load(tokenizer, cfg):
+    """
+    Build the pretraining tokenization strategy.
+
+    The strategy's max_length is set to 64 times `cfg.sequence_len`.
+    """
+    prompter = PretrainTokenizer()
+    return PretrainTokenizationStrategy(
+        prompter,
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+        max_length=cfg.sequence_len * 64,
+    )
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -6,16 +6,19 @@ from fastchat.conversation import Conversation, SeparatorStyle, register_conv_te
 from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
 from axolotl.prompters import ShareGPTPrompterV2

-register_conv_template(
-    Conversation(
-        name="chatml",
-        system_template="<|im_start|>system\n{system_message}",
-        system_message="You are a helpful assistant.",
-        roles=["<|im_start|>user", "<|im_start|>assistant"],
-        sep_style=SeparatorStyle.CHATML,
-        sep="<|im_end|>",
+
+def register_chatml_template(system_message=None):
+    # Registers a fastchat Conversation named "chatml" via
+    # register_conv_template. A falsy system_message (None or "") falls back
+    # to "You are a helpful assistant.".
+    system_message = system_message or "You are a helpful assistant."
+    register_conv_template(
+        Conversation(
+            name="chatml",
+            system_template="<|im_start|>system\n{system_message}",
+            system_message=system_message,
+            roles=["<|im_start|>user", "<|im_start|>assistant"],
+            sep_style=SeparatorStyle.CHATML,
+            sep="<|im_end|>",
+        )
    )
-)


 def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -51,7 +51,7 @@ class AlpacaPrompter(Prompter):
            self.turn_no_input_format = (
                "### Instruction:\n{instruction}\n\n### Response:\n"
            )
-            self.system_format = "### System:\n{system}\n\n"
+            self.system_format = "{system}\n\n"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -11,11 +11,10 @@ import torch
 import transformers.modelcard
 from accelerate.logging import get_logger
 from datasets import Dataset
-from optimum.bettertransformer import BetterTransformer
 from peft import PeftModel
 from pkg_resources import get_distribution  # type: ignore
 from transformers import PreTrainedModel, PreTrainedTokenizer
-from transformers.deepspeed import is_deepspeed_zero3_enabled
+from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

 from axolotl.common.cli import TrainerCliArgs
 from axolotl.logging_config import configure_logging
@@ -24,6 +23,11 @@ from axolotl.utils.freeze import freeze_parameters_except
 from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.trainer import setup_trainer

+try:
+    from optimum.bettertransformer import BetterTransformer
+except ImportError:
+    BetterTransformer = None
+
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
@@ -57,26 +61,6 @@ def train(
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps

-    # Load the model and tokenizer
-    msg = "loading model"
-    if cfg.adapter:
-        msg += " and peft_config..."
-    LOG.debug(msg)
-    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
-    model_ref = None
-    if cfg.rl:
-        if cfg.adapter and not cfg.rl_adapter_ref_model:
-            # use built-in trl autounwrap
-            LOG.debug("Passing model_ref: None to RL trainer")
-            model_ref = None  # explicit setting to None
-        else:
-            # load the model again for model_ref/baseline
-            model_ref, _ = load_model(
-                cfg, tokenizer, inference=cli_args.inference, reference_model=True
-            )
-
-    safe_serialization = cfg.save_safetensors is True
-
    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
        possible_checkpoints = [
            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
@@ -92,11 +76,38 @@ def train(
            )
    resume_from_checkpoint = cfg.resume_from_checkpoint

+    # Load the model and tokenizer
+    msg = "loading model"
+    if cfg.adapter:
+        msg += " and peft_config..."
+    LOG.debug(msg)
+    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
+    model.generation_config.do_sample = True
+
+    model_ref = None
+    if cfg.rl:
+        if cfg.adapter and not cfg.rl_adapter_ref_model:
+            # use built-in trl autounwrap
+            LOG.debug("Passing model_ref: None to RL trainer")
+            model_ref = None  # explicit setting to None
+        else:
+            # load the model again for model_ref/baseline
+            model_ref, _ = load_model(
+                cfg, tokenizer, inference=cli_args.inference, reference_model=True
+            )
+
+    safe_serialization = cfg.save_safetensors is True
+
    if cfg.unfrozen_parameters:
        freeze_parameters_except(model, cfg.unfrozen_parameters)

    trainer = setup_trainer(
-        cfg, train_dataset, eval_dataset, (model, model_ref), tokenizer, total_num_steps
+        cfg,
+        train_dataset,
+        eval_dataset,
+        (model, model_ref, peft_config),
+        tokenizer,
+        total_num_steps,
    )

    if hasattr(model, "config"):
@@ -117,7 +128,7 @@ def train(
    if cfg.local_rank == 0:

        def terminate_handler(_, __, model):
-            if cfg.flash_optimum:
+            if cfg.flash_optimum and BetterTransformer:
                model = BetterTransformer.reverse(model)
            model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
            sys.exit(0)
@@ -142,7 +153,10 @@ def train(
    pretrain_hooks(cfg, trainer)
    if cfg.flash_optimum:
        with torch.backends.cuda.sdp_kernel(
-            enable_flash=True, enable_math=True, enable_mem_efficient=True
+            # TODO configure these from the YAML w/ sdp_kernel_kwargs: ...
+            enable_flash=True,
+            enable_math=True,
+            enable_mem_efficient=True,
        ):
            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    else:
@@ -188,13 +202,16 @@ def train(
            state_dict=trainer.accelerator.get_state_dict(trainer.model_wrapped),
        )
    elif cfg.local_rank == 0:
-        if cfg.flash_optimum:
+        if cfg.flash_optimum and BetterTransformer:
            model = BetterTransformer.reverse(model)

        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)

    if not cfg.hub_model_id:
-        trainer.create_model_card(model_name=cfg.output_dir.lstrip("./"))
+        try:
+            trainer.create_model_card(model_name=cfg.output_dir.lstrip("./"))
+        except AttributeError:
+            pass
    elif cfg.hub_model_id:
        # defensively push to the hub to ensure the model card is updated
        trainer.push_to_hub()
--- a/src/axolotl/utils/bench.py
+++ b/src/axolotl/utils/bench.py
@@ -20,7 +20,8 @@ def check_cuda_device(default_value):
            device = kwargs.get("device", args[0] if args else None)

            if (
-                not torch.cuda.is_available()
+                device is None
+                or not torch.cuda.is_available()
                or device == "auto"
                or torch.device(device).type == "cpu"
            ):
@@ -46,6 +47,12 @@ def gpu_memory_usage_all(device=0):
    return usage, reserved - usage, max(0, smi - reserved)


+def mps_memory_usage_all():
+    """
+    Report MPS memory in GiB as (allocated, driver-allocated minus
+    allocated, 0), the same 3-tuple shape as gpu_memory_usage_all.
+    """
+    gib = 1024.0**3
+    allocated = torch.mps.current_allocated_memory() / gib
+    driver_total = torch.mps.driver_allocated_memory() / gib
+    return allocated, driver_total - allocated, 0
+
+
@check_cuda_device(0.0)
 def gpu_memory_usage_smi(device=0):
    if isinstance(device, torch.device):
@@ -62,7 +69,10 @@ def gpu_memory_usage_smi(device=0):


 def log_gpu_memory_usage(log, msg, device):
-    usage, cache, misc = gpu_memory_usage_all(device)
+    if torch.backends.mps.is_available():
+        usage, cache, misc = mps_memory_usage_all()
+    else:
+        usage, cache, misc = gpu_memory_usage_all(device)
    extras = []
    if cache > 0:
        extras.append(f"+{cache:.03f}GB cache")
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -9,6 +9,7 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Dict, List

 import evaluate
+import mlflow
 import numpy as np
 import pandas as pd
 import torch
@@ -61,7 +62,7 @@ class EvalFirstStepCallback(
    ):
        if (
            args.evaluation_strategy == IntervalStrategy.STEPS
-            and args.eval_steps < 1.0
+            and (args.eval_steps < 1.0 or args.eval_steps > 1)
            and state.global_step == 1
        ):
            control.should_evaluate = True
@@ -575,3 +576,31 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
        return control
+
+
+class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
+    """Callback to save axolotl config to mlflow"""
+
+    def __init__(self, axolotl_config_path):
+        self.axolotl_config_path = axolotl_config_path
+
+    def on_train_begin(
+        self,
+        args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
+        state: TrainerState,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,  # pylint: disable=unused-argument
+    ):
+        if is_main_process():
+            try:
+                with NamedTemporaryFile(
+                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
+                ) as temp_file:
+                    copyfile(self.axolotl_config_path, temp_file.name)
+                    mlflow.log_artifact(temp_file.name, artifact_path="")
+                    LOG.info(
+                        "The Axolotl config has been saved to the MLflow artifacts."
+                    )
+            except (FileNotFoundError, ConnectionError) as err:
+                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
+        return control
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Wing Lian	d465b9fd98	wip, jagged restarts	2024-02-16 14:34:08 -05:00
Maxime	fac2d98c26	Add MPS support (#1264 ) * add mps support * linter stuff * CI fixes * install packaging for various tests * Update setup.py * Revert "install packaging for various tests" This reverts commit `980e7aa44d`. * Revert "CI fixes" This reverts commit `4609e3b166`. --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-02-12 08:30:32 -05:00
Wing Lian	ea00dd0852	don't use load and push together (#1284 )	2024-02-09 14:54:31 -05:00
Hamel Husain	b2a4cb4396	Update README.md (#1281 )	2024-02-09 07:38:08 -08:00
Wing Lian	aaf54dc730	run the docker image builds and push on gh action gpu runners (#1218 )	2024-02-09 10:32:54 -05:00
Hamel Husain	9bca7db133	add support for https remote yamls (#1277 )	2024-02-08 20:02:17 -08:00
Hamel Husain	91cf4ee72c	allow remote data paths (#1278 ) * allow remote data paths * add docs about public url * only allow https * better docs * better docs	2024-02-08 15:02:35 -08:00
Wing Lian	1daecd161e	copy edits (#1276 )	2024-02-08 09:00:04 -05:00
Wing Lian	4a654b331e	Add link to axolotl cloud image on latitude (#1275 )	2024-02-08 08:50:11 -05:00
Wing Lian	5698943263	simplify haldning for newer multipack patches so they can be added in a single place (#1270 )	2024-02-07 10:46:04 -05:00
Wing Lian	411293bdca	contributor avatars (#1269 )	2024-02-07 07:09:01 -08:00
Zac Brannelly	73f1bdaa15	Fix bug preventing model_kwargs being injected (#1262 )	2024-02-07 09:38:35 -05:00
JohanWork	1c7ed26785	lock pytorch (#1247 ) [skip ci]	2024-02-06 07:48:26 -05:00
Philip May	13eea21f9b	Add more save strategies for DPO training. (#1255 ) * Set save_strategy and save_steps in HFDPOTrainerBuilder * fix doublicate save_steps	2024-02-06 00:38:43 -05:00
Chirag Jain	1072f28874	Fix typo `bloat16` -> `bfloat16` (#1257 )	2024-02-06 00:38:14 -05:00
Wing Lian	c7cf3810bd	Pretrain transforms (#1261 ) * wip for pretraining/iterable data with arbitrary prompt strategies * more fixes, wip * more fixes for custom pretraining * iterable ds wrapper not needed * remove extra features * chore: lint * update pretraning example yml * fix order for partials * fixup for tests	2024-02-06 00:37:03 -05:00
Wing Lian	8c2e05ade3	relora: magnitude pruning of the optimizer (#1245 ) * magnitude pruning of the optimizer * add alpaca chat template and fix relora patch * fix handling of lora adapter for relora * fix merge and save call * fixes for 8-bit lora merge * save intermediate checkpoint adapters * auto merge * fix eval check * handle relora annealing * fix anneal step logic * chore: lint * misx fix * fix types * Update tests/e2e/test_relora_llama.py * check for safetensors saved from relora	2024-02-06 00:35:30 -05:00
NanoCode012	2d65f470d5	fix(model): apply gate fp32 only for mixtral (#1241 ) * fix(model): apply gate fp32 only for mixtral * Update src/axolotl/utils/models.py * fix gate layer check --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-02-01 13:55:05 -05:00
Wing Lian	dfd188502a	add contact info for dedicated support for axolotl [skip ci] (#1243 )	2024-02-01 12:59:07 -05:00
Wing Lian	00568c1539	support for true batches with multipack (#1230 ) * support for true batches with multipack * patch the map dataset fetcher to handle batches with packed indexes * patch 4d mask creation for sdp attention * better handling for BetterTransformer * patch general case for 4d mask * setup forward patch. WIP * fix patch file * support for multipack w/o flash attention for llama * cleanup * add warning about bf16 vs fp16 for multipack with sdpa * bugfixes * add 4d multipack tests, refactor patches * update tests and add warnings * fix e2e file check * skip sdpa test if not at least torch 2.1.1, update docs	2024-02-01 10:18:42 -05:00
Wing Lian	c67fb71583	Peft deepspeed resume (#1227 ) * import deepspeed integration * monkeypatch peft adapater with deepspeed for resume from checkpoint * fix patch * fix patches attempt 2 * make sure to set lora_model_dir * skip pylint for deepspeed.utils * pick up upstream fix in transformers * remove monkeypatch for deepspeed/peft fix * no need to set the lora_model_dir on resume * unset load_in_bit when using quant config guard before del * better handling of load_in* kwargs	2024-01-31 18:13:29 -05:00
DreamGenX	25e037fe2d	Support for additional_special_tokens (#1221 ) [skip ci] * Support for additional_special_tokens * Support for additional_special_tokens. Adjust whitespace. * Support for additional_special_tokens. Use correct quotes. * Support for additional_special_tokens. Safe pop. * Support for additional_special_tokens. nt. * Support for additional_special_tokens. cfg.special_tokens may be None. * add token if not in vocabulary when adding additional_special_tokens * fix logic for copy/pasta * bugfix for popping from config and tokenizer reload * no need to add tokens manually now with previous bugfix --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-31 18:13:13 -05:00
Hamel Husain	52c83d30bf	Update rlhf.md (#1237 ) [skip ci]	2024-01-31 17:27:35 -05:00
Wing Lian	d113331e9a	add a helpful motd for cloud image (#1235 ) [skip ci]	2024-01-31 10:26:02 -05:00
Wing Lian	8f2b591baf	set torch version to what is installed during axolotl install (#1234 )	2024-01-31 08:47:34 -05:00
DreamGenX	5787e1a23f	Fix and document test_datasets (#1228 ) * Make sure test_dataset are used and treat val_set_size. * Add test_datasets docs. * Apply suggestions from code review --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-31 06:48:57 -05:00
xhedit	8608d8003e	Fix typo (#1231 ) [skip ci]	2024-01-31 06:46:55 -05:00
Wing Lian	4cb7900a56	Peft lotfq (#1222 ) * loftq support for lora * fix loftq check * update readme for loftq * readability cleanup * use peft main for loftq fixes, remove unnecessary special tokens * remove unused test from older deprecation	2024-01-28 18:50:08 -05:00
Filippo Broggini	18f811978c	FEAT: add tagging support to axolotl for DPOTrainer (#1209 ) * Add AxolotlDPOTrainer * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-26 20:01:57 -05:00
Wing Lian	afb5dd9655	Update FUNDING.yml [skip ci]	2024-01-26 20:00:28 -05:00
Wing Lian	8da1633124	Revert "run PR e2e docker CI tests in Modal" (#1220 ) [skip ci]	2024-01-26 16:50:44 -05:00
Wing Lian	36d053f6f0	run PR e2e docker CI tests in Modal (#1217 ) [skip ci] * wip modal for ci * handle falcon layernorms better * update * rebuild the template each time with the pseudo-ARGS * fix ref * update tests to use modal * cleanup ci script * make sure to install jinja2 also * kickoff the gh action on gh hosted runners and specify num gpus	2024-01-26 16:13:27 -05:00
JohanWork	af29d81f80	ADD: warning if hub_model_id ist set but not any save strategy (#1202 ) * warning if hub model id set but no save * add warning * move the warning * add test * allow more public methods for tests for now * fix tests --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-26 10:38:55 -05:00
Wing Lian	1b180034c7	ensure the tests use the same version of torch as the latest base docker images (#1215 ) [skip ci]	2024-01-26 10:38:30 -05:00
DreamGenX	62ca4a2b71	Respect sliding_window=None (#1214 )	2024-01-26 07:43:37 -05:00
Igor Berlenko	5407ddd233	Update qlora.yml - remove `max_packed_sequence_len` (#1210 ) [skip ci]	2024-01-26 07:43:05 -05:00
Wing Lian	74c72ca5eb	drop py39 docker images, add py311, upgrade pytorch to 2.1.2 (#1205 ) * drop py39 docker images, add py311, upgrade pytorch to 2.1.2 * also allow the main build to be manually triggered * fix workflow_dispatch in yaml	2024-01-26 00:38:49 -05:00
Wing Lian	e923e62d24	more checks and fixes for deepspeed and fsdp (#1208 ) [skip ci]	2024-01-25 20:01:45 -05:00
Wing Lian	ba944e6554	workaround for transformers bug requireing do_sample for saveing pretrained (#1206 )	2024-01-25 11:34:41 -05:00
Wing Lian	badda3783b	make sure to register the base chatml template even if no system message is provided (#1207 )	2024-01-25 10:38:08 -05:00
Wing Lian	a01b998c0f	Update deps 202401 (#1204 ) [skip ci] * update deps * xformers fix too	2024-01-25 10:11:49 -05:00
Wing Lian	33e117088f	precompute dpo logprobs setting and fixes (#1199 ) [skip ci] * add support for precompute_ref_log_probs for dpo * add chatml.icr type for argilla orca dpo * update inline doc * also set use_reentrant to false for dpo when not set * don't set use_reentrant to true for rl * make sure to set gradient checkpointing too	2024-01-25 09:31:55 -05:00
Ricardo Dominguez-Olmedo	b4ac96adef	fix learning rate scheduler's warnings (#1135 ) [skip ci] * fix schedulers warnings * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-25 07:09:34 -05:00
mhenrichsen	98b4762077	Feat/chatml add system message (#1117 ) * add system message to template * readme update * added code to register new system message * register chatml template for test --------- Co-authored-by: Mads Henrichsen <mads@BrbartiendeMads.lan> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-25 08:24:27 +01:00
JohanWork	ee0b5f60e5	add colab example (#1196 ) [skip ci]	2024-01-24 20:09:09 -05:00
NanoCode012	08719b9609	fix(log): improve warning to clarify that lora_modules_to_save expect a list (#1197 )	2024-01-24 20:08:34 -05:00
Wing Lian	1427d5b502	prepare for release 0.4.0 (#1175 ) Some checks failed publish pypi / Upload release to PyPI (push) Has been cancelled Details	2024-01-24 15:00:28 -05:00
Wing Lian	54d2ac155b	Mixtral fixes 20240124 (#1192 ) [skip ci] * mixtral nccl fixes * make sure to patch for z3	2024-01-24 14:59:57 -05:00
Oleh Kuznetsov	af0243021c	Standardize system prompt format for AlpacaPrompter (#1190 ) [skip ci]	2024-01-24 14:27:01 -05:00
Wing Lian	8a49309489	upgrade deepspeed to 0.13.1 for mixtral fixes (#1189 ) [skip ci] * upgrade deepspeed to 0.13.1 for mixtral fixes * move deepspeed-kernels install to setup.py	2024-01-24 14:26:40 -05:00
Wing Lian	5bce45f800	more dpo fixes for dataset loading and docs (#1185 ) [skip ci] * more dpo fixes for dataset loading and docs * preprocess dpo datasets	2024-01-24 14:23:55 -05:00
Wing Lian	d85d4942cf	report min lenght of tokenized data (#1186 ) [skip ci]	2024-01-24 09:17:50 -05:00
Agung Baptiso Sorlawan	02f2c720fc	Fix generation_config validation raises Exception for do_merge_lora (#1184 )	2024-01-24 00:42:15 -05:00
James Wade	71141deb18	Add support for offline mode with HF_HUB_OFFLINE envvar (#1182 ) * Add support for offline mode with HF_HUB_OFFLINE envvar * Apply styling * chore: lint --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-24 00:41:47 -05:00
Aleksey Korshuk	dc051b861d	Update rlhf.md (#1178 ) [skip ci]	2024-01-23 15:54:51 -05:00
Wing Lian	59a31fe613	DPO fixes v2 (#1174 ) * check for length before trying to remove it * add validation for sample packing with RLHF	2024-01-23 12:56:24 -05:00
Wing Lian	814aee6603	Phi2 multipack (#1173 ) * phi2 multipack * update validation and examples for phi * more updates to phi examples * make sure to use the correct collator for phi multipack * phi needs attention mask now for multipack * if the special token already exists in the tokenizer, don't require in lora modules to save * fix qlora yml for phi, fix phi test validation * test qlora too * make sure flash attention is enabled for the test * don't use remote code for phi anymore * reduce sequence len for sample packing phi	2024-01-23 12:54:36 -05:00
Wing Lian	b715cd549a	update docs [skip ci] (#1176 )	2024-01-23 11:14:52 -05:00
Wing Lian	fb7f9b9516	don't fail if can't cast weights due to offload when merging (#1172 ) [skip ci]	2024-01-23 09:17:08 -05:00
Tilemachos Chatzipapas	cc250391a0	Fine-Tuning Mistral-7b for Real-World Chatbot Applications Using Axolotl (Lora used) (#1155 ) * Mistral-7b finetune example using axolotl with code,config,data * Corrected the path for huggingface dataset * Update data.jsonl * chore: lint --------- Co-authored-by: twenty8th <twenty8th@users.noreply.github.com> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-23 07:32:21 -05:00
Ayush Singh	9135b9e2aa	Update README.md (#1169 ) [skip ci] Fix typo	2024-01-23 07:25:44 -05:00
Wing Lian	7523d1f557	DPO cleanup (#1126 ) * cleanup dpo to be a little more extensible, add zephyr/nectar strategy * fix eos slash * support for eval split * fix kwargs * handle empty evals * don't load peft model for dpo * ensure dpo traning args gets bf16 for peft if applicable * fix duplicate kwargs for bf16 * make sure to respect the configured lr scheduler * supprt trainer callback to push config to wandb * set dataloader preload args * ensure that we are loading the lora when merging * Update src/axolotl/utils/data.py Co-authored-by: Agus <agustin.piqueres@gmail.com> * support local datasets for dpo Co-authored-by: Agus <agustin.piqueres@gmail.com> * chore: lint * dpo/kto/ipo smoke tests w lora, simplify dpo dataset type names * add split to dpo tests * fix rebase/merging error * handle edge case w logging * use accelerator for dpo datasets so it doesn't break the logger * missing args * validate checkpoint is an adapter for now * log warning when dataset strategy is not loadable --------- Co-authored-by: Agus <agustin.piqueres@gmail.com>	2024-01-23 00:40:37 -05:00
JohanWork	5439707489	Feat(test): Add tests for alpaca chatml prompt tokenizer (#1088 ) * draft for adding test for tokenizer * clean up * clean up * fix pre commit * fix pylint * Revert "fix pylint" This reverts commit `cd2cda3cda`. * add pylint exception for pytest fixture * update comments * Apply suggestions from code review Co-authored-by: NanoCode012 <kevinvong@rocketmail.com> * update spelling and import promptstyle * reaname, restrucure * clean up * add fmt:on --------- Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>	2024-01-23 13:30:26 +09:00
Casper	684038111e	Add desc to map/filter (#1162 ) * Add desc to map/filter * update descriptions --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-22 21:30:53 -05:00
Wing Lian	cda52dc32b	support for explicit test_dataset definition for evals (#786 )	2024-01-22 21:29:56 -05:00
Wing Lian	e799e08d3c	Falcon embeddings (#1149 ) [skip docker] * also fix multipack for falcon and add smoke tests * make sure to handle special tokens and added tokens for lora * fix reference to model_type * fix tests for falcon * fix stray typo * fixes for smoke tests	2024-01-22 21:01:42 -05:00
Wing Lian	0f77b8d798	add commit message option to skip docker image builds in ci (#1168 ) [skip ci]	2024-01-22 19:55:36 -05:00
Wing Lian	32580c1ca7	Vram fix attempt (#1164 ) [skip ci] * revert order of filter/drop_long step and handle calc for max_input_len only during preprocessing * revert some changes to preparing for packing to allow more flexibility * prepare dataset for packing during pre-processing step * prepare dataset hash based on sample packing too * enclose none check * just cast straight to string for ds hash	2024-01-22 19:54:54 -05:00
Wing Lian	802f9667a2	improve vram use w gradient checkpointing (#1167 ) [skip ci]	2024-01-22 19:48:22 -05:00
JohanWork	b8e5603467	Add mlflow callback for pushing config to mlflow artifacts (#1125 ) * Update callbacks.py adding callback for mlflow * Update trainer_builder.py * clean up	2024-01-22 18:44:39 -05:00
Wing Lian	782b6a4216	set fp16 to false if bf16, update bf16: auto in example YAMLs (#1122 ) [skip ci] * set fp16 to false if bf16, update bf16: auto in example YAMLs * unset fp16 so that it fallsback properly if bf16 isn't available * Update README.md [skip-ci] Co-authored-by: NanoCode012 <kevinvong@rocketmail.com> * test that bf16 disables fp16 --------- Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>	2024-01-22 18:44:01 -05:00
Wing Lian	eaaeefce55	jupyter lab fixes (#1139 ) [skip ci] * add a basic notebook for lab users in the root * update notebook and fix cors for jupyter * cell is code * fix eval batch size check * remove intro notebook	2024-01-22 18:42:40 -05:00
Wing Lian	f5a828aa20	Qwen2 (#1166 ) * qwen2 multipack support * fix qwen derived model check so it doesn't break qwen2 * fixes to ensure qwen2 packing works * bump requirements for qwen2 * requirements typo	2024-01-22 18:24:15 -05:00
Wing Lian	fccb542b47	make sure the model config loader respects the model_revision too (#1160 ) [skip-ci]	2024-01-22 13:23:14 -05:00
Wing Lian	2ce5c0d68a	Deprecate max packed sequence len (#1141 )	2024-01-20 05:11:50 -05:00
NanoCode012	3db5f2fd17	feat(dataset): add config to keep processed dataset in memory (#1152 )	2024-01-20 13:19:28 +09:00
Wing Lian	cbecf3e62a	fix check for env var (#1151 )	2024-01-18 23:58:11 -05:00
Wing Lian	729740df81	Dockerfile cloud ports (#1148 ) * explicitly expose ports 8888 and 22 * support for SSH_KEY from latitude	2024-01-18 22:04:25 -05:00
Joe Cummings	08b8ba09a5	Fix link for Minotaur model (#1146 ) [skip-ci]	2024-01-18 17:22:04 -05:00
Wing Lian	6910e6a8ca	Multipack simplify for Mixtral (#1142 )	2024-01-18 16:23:49 -05:00
Joe Cummings	1d70f24b50	Add shifted sparse attention (#973 ) [skip-ci] * Add s2_attn to hijack flash code * Refactor code to account for s2_attn * Add test for models utils * Add ``s2_attention`` option to llama configs * Add ``s2_attention`` option to README config * Format code to appease linter * chore: lint * Remove xpos and llama-landmark [bad merge] * add e2e smoke tests for shifted sparse attention * remove stray patch from merge * update yml with link to paper for s2_attention/longlora * fix assertion check for full fine tune * increase sequence len for tests and PR feedback updates * reduce context len to 16k for tests * reduce context len to 16k for tests * reduce batch size for larger context len and udpate test to check message * fix test for message --------- Co-authored-by: joecummings <jrcummings@devvm050.nha0.facebook.com> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2024-01-18 10:16:07 -05:00
Wing Lian	317fa2555a	fix bf16 check when preprocessing data (#1140 )	2024-01-17 22:41:23 -05:00
NanoCode012	1e56b88cde	fix(preprocess): Make sure dataset not loaded from cache when using preprocess cli (#1136 )	2024-01-18 03:03:52 +09:00
Wing Lian	7570446596	Preprocess dataset size fix (#1131 ) * overwrite cache on preprocess step * don't cache the TokenizedPromptDataset at all * load_from_cache_file no longer needed	2024-01-17 11:02:41 -05:00