experiment with latent space

2023-08-18 05:47:26 -04:00
43 changed files with 966 additions and 2511 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -13,16 +13,21 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 118
+          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.9"
            pytorch: 2.0.1
            axolotl_extras:
-          - cuda: 118
+          - cuda: cu118
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.0.1
            axolotl_extras:
+          - cuda: cu118
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.1
+            axolotl_extras: gptq
    runs-on: self-hosted
    steps:
      - name: Checkout
@@ -44,11 +49,10 @@ jobs:
        with:
          context: .
          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
  build-axolotl-runpod:
    needs: build-axolotl
@@ -68,6 +72,11 @@ jobs:
            pytorch: 2.0.1
            axolotl_extras:
            is_latest: true
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.1
+            axolotl_extras: gptq
    runs-on: self-hosted
    steps:
      - name: Checkout
--- a/README.md
+++ b/README.md
@@ -16,7 +16,6 @@ Axolotl is a tool designed to streamline the fine-tuning of various AI models, o
  - [LambdaLabs Installation](#lambdalabs)
 - [Dataset](#dataset)
  - [How to Add Custom Prompts](#how-to-add-custom-prompts)
-  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
  - [Train](#train)
  - [Inference](#inference)
@@ -69,9 +68,8 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo

 ```bash
 git clone https://github.com/OpenAccess-AI-Collective/axolotl
-cd axolotl

-pip3 install -e .[flash-attn]
+pip3 install -e .
 pip3 install -U git+https://github.com/huggingface/peft.git

 # finetune lora
@@ -100,7 +98,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
  ```

 - Conda/Pip venv
-  1. Install python >=**3.9**
+  1. Install python **3.9**

  2. Install pytorch stable https://pytorch.org/get-started/locally/

@@ -153,7 +151,9 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \

  pip3 install -e . # change depend on needs
  pip3 install protobuf==3.20.3
-  pip3 install -U --ignore-installed requests Pillow psutil scipy
+  pip3 install -U requests
+  pip3 install -U --ignore-installed psutil
+  pip3 install -U scipy
  pip3 install git+https://github.com/huggingface/peft.git # not for gptq
  ```

@@ -257,10 +257,6 @@ Have dataset(s) in one of the following format (JSONL recommended):
  ```json
  {"conversations": [{"role": "...", "value": "..."}]}
  ```
- `metharme`: instruction, adds additional eos tokens
-  ```json
-  {"prompt": "...", "generation": "..."}
-  ```
 - `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
  ```json
  {"conversations": [{"role": "...", "value": "..."}]}
@@ -278,29 +274,11 @@ Have dataset(s) in one of the following format (JSONL recommended):

 #### How to add custom prompts

-Using yaml. Example:
-```yaml
-datasets:
-  - path: repo
-    type:
-      system_prompt: ""
-      no_input_format: |-
-        User: {instruction}<|end_of_turn|>
-        Assistant:
-      format: |-
-        User: {instruction}
-        {input}<|end_of_turn|>
-        Assistant:
-```
+  1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
+  2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.

-Using file:
-1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
+Optionally, download some datasets; see [data/README.md](data/README.md).

-#### How to use your custom pretokenized dataset
-
- Do not pass a `type:`
- Dataset must contain `input_ids`, `attention_mask`, `labels` in columns


 ### Config
@@ -328,20 +306,11 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
      name: enron_emails
      type: completion # format from earlier

-  # huggingface repo with multiple named configurations/subsets
-  datasets:
-    - path: bigcode/commitpackft
-      name:
-        - ruby
-        - python
-        - typescript
-      type: ... # unimplemented custom format
-
  # local
  datasets:
-    - path: data.jsonl # or json
-      ds_type: json # see other options below
-      type: alpaca
+    - path: json
+      data_files: data.jsonl # or json
+      type: alpaca # format from earlier
  ```

 - loading
@@ -416,39 +385,16 @@ fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere

-# No AMP (automatic mixed precision)
-bfloat16: true # require >=ampere
-float16: true
-
 # a list of one or more datasets to finetune the model with
 datasets:
  # hf dataset repo | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-    ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
    data_files: # path to source data files
    shards: # number of shards to split data into
    name: # name of dataset configuration to load

-  # custom user prompt
-  - path: repo
-    type:
-      # the below are defaults. only set what's needed.
-      system_prompt: ""
-      field_system: system
-      field_instruction: instruction
-      field_output: input
-
-      # customizable to be single line or multi-line
-      system_format: "{system}"
-      # 'format' can include {input}
-      format: |-
-        User: {instruction} {input}
-        Assistant:
-      # 'no_input_format' cannot include {input}
-      no_input_format: "{instruction} "
-
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
@@ -472,9 +418,6 @@ dataset_shard_idx:
 # the maximum length of an input to train with, this should typically be less than 2048
 # as most models have a token/context limit of 2048
 sequence_len: 2048
-# pad inputs so each step uses constant sized buffers
-# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len:
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 # FutureWarning: This will soon be DEPRECATED
@@ -509,12 +452,6 @@ lora_modules_to_save:
 lora_out_dir:
 lora_fan_in_fan_out: false

-# ReLoRA configuration
-# must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-relora_steps: # number of steps per ReLoRA restart
-relora_warmup_steps: # number of per-restart warmup steps
-relora_cpu_offload: # true to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
 # wandb configuration if you're using it
 wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
 wandb_project: # your wandb project name
@@ -535,9 +472,8 @@ warmup_steps: 100
 learning_rate: 0.00003
 lr_quadratic_warmup:
 logging_steps:
-save_strategy: # set to `no` to skip checkpoint saves
 save_steps: # leave empty to save at each epoch
-eval_steps: # leave empty to eval at each epoch
+eval_steps:
 save_total_limit: # checkpoints saved at a time
 max_steps:

@@ -626,6 +562,9 @@ deepspeed:
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:

+# Set padding for data collator to 'longest'
+collator_pad_to_longest:
+
 # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
 pretraining_dataset:

@@ -645,7 +584,7 @@ strict:

 Run
 ```bash
-accelerate launch scripts/finetune.py your_config.yml
+accelerate launch scripts/finetune.py configs/your_config.yml
 ```

 #### Multi-GPU
@@ -727,9 +666,7 @@ Please reduce any below
  - `gradient_accumulation_steps`
  - `sequence_len`

-> `failed (exitcode: -9)`
-
-Usually means your system has run out of system memory.
+> `failed (exitcode: -9)` usually means your system has run out of system memory.
 Similarly, you should consider reducing the same settings as when you run out of VRAM.
 Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades.

--- a/data/README.md
+++ b/data/README.md
@@ -0,0 +1,24 @@
+
+## Download some datasets
+```shell
+curl https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_gpt4.json -o data/raw/alpaca_data_gpt4.json
+curl https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -L -o data/raw/vicuna_cleaned.json
+curl https://github.com/teknium1/GPTeacher/blob/main/Instruct/gpt4-instruct-similarity-0.6-dataset.json?raw=true -L -o data/raw/gpt4-instruct-similarity-0.6-dataset.json
+curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarity_0.6-instruct-dataset.json?raw=true -L -o data/raw/roleplay-similarity_0.6-instruct-dataset.json
+```
+
+## Convert the JSON data files to JSONL.
+
+```shell
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl
+python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl
+```
+---
+
+Using JSONL makes it easier to subset the data if you want a smaller training set, e.g., to get 2000 random examples.
+
+```shell
+shuf -n2000 data/vicuna_cleaned.jsonl > data/vicuna_cleaned.subset0.jsonl
+```
--- a/data/raw/.gitignore
+++ b/data/raw/.gitignore
@@ -0,0 +1 @@
+**
--- a/deepspeed/zero2.json
+++ b/deepspeed/zero2.json
@@ -1,46 +0,0 @@
-{
-    "zero_optimization": {
-      "stage": 2,
-      "offload_optimizer": {
-        "device": "cpu"
-      },
-      "contiguous_gradients": true,
-      "overlap_comm": true
-    },
-    "bf16": {
-      "enabled": "auto"
-    },
-    "fp16": {
-      "enabled": "auto",
-      "auto_cast": false,
-      "loss_scale": 0,
-      "initial_scale_power": 32,
-      "loss_scale_window": 1000,
-      "hysteresis": 2,
-      "min_loss_scale": 1
-    },
-    "optimizer": {
-      "type": "AdamW",
-      "params": {
-        "lr": "auto",
-        "betas": [
-          0.9,
-          0.999
-        ],
-        "eps": 1e-8,
-        "weight_decay": "auto"
-      }
-    },
-    "scheduler": {
-      "type": "WarmupDecayLR",
-      "params": {
-        "warmup_min_lr": "auto",
-        "warmup_max_lr": "auto",
-        "warmup_num_steps": "auto",
-        "total_num_steps": "auto"
-      }
-    },
-    "train_batch_size": "auto",
-    "train_micro_batch_size_per_gpu": "auto",
-    "wall_clock_breakdown": false
-}
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -11,13 +11,14 @@ RUN apt-get update && \

 WORKDIR /workspace

+RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
 RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN cd axolotl && \
    if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[flash-attn,gptq,$AXOLOTL_EXTRAS]; \
+        pip install -e .[$AXOLOTL_EXTRAS]; \
    else \
-        pip install -e .[flash-attn,gptq]; \
+        pip install -e .; \
    fi

 # fix so that git fetch/pull from remote works
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -31,6 +31,26 @@ WORKDIR /workspace
 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA

+
+FROM base-builder AS flash-attn-builder
+
+WORKDIR /workspace
+
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+
+RUN git clone https://github.com/Dao-AILab/flash-attention.git && \
+    cd flash-attention && \
+    git checkout v2.0.4  && \
+    python3 setup.py bdist_wheel && \
+    cd csrc/fused_dense_lib && \
+    python3 setup.py bdist_wheel && \
+    cd ../xentropy && \
+    python3 setup.py bdist_wheel && \
+    cd ../rotary && \
+    python3 setup.py bdist_wheel && \
+    cd ../layer_norm && \
+    python3 setup.py bdist_wheel
+
 FROM base-builder AS deepspeed-builder

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
@@ -70,8 +90,13 @@ RUN mkdir -p /workspace/wheels/bitsandbytes
 COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
 COPY --from=bnb-builder /workspace/bitsandbytes/dist/bitsandbytes-*.whl wheels
 COPY --from=bnb-builder /workspace/bitsandbytes/bitsandbytes/libbitsandbytes*.so wheels/bitsandbytes
+COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
+COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
+COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy_cuda_lib-*.whl wheels
+COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary_emb-*.whl wheels
+COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels

-RUN pip3 install wheels/deepspeed-*.whl
+RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy_cuda_lib-*.whl wheels/rotary_emb-*.whl wheels/dropout_layer_norm-*.whl
 RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
 RUN git lfs install --skip-repo
 RUN pip3 install awscli && \
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -1,67 +0,0 @@
-base_model: codellama/CodeLlama-13b-hf
-base_model_config: codellama/CodeLlama-13b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./lora-out
-
-sequence_len: 100000
-sample_packing: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -1,69 +0,0 @@
-base_model: codellama/CodeLlama-13b-hf
-base_model_config: codellama/CodeLlama-13b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 100000
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: paged_adamw_32bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -1,67 +0,0 @@
-base_model: codellama/CodeLlama-34b-hf
-base_model_config: codellama/CodeLlama-34b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./lora-out
-
-sequence_len: 100000
-sample_packing: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -1,69 +0,0 @@
-base_model: codellama/CodeLlama-34b-hf
-base_model_config: codellama/CodeLlama-34b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 100000
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: paged_adamw_32bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -1,67 +0,0 @@
-base_model: codellama/CodeLlama-7b-hf
-base_model_config: codellama/CodeLlama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: true
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./lora-out
-
-sequence_len: 100000
-sample_packing: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -1,69 +0,0 @@
-base_model: codellama/CodeLlama-7b-hf
-base_model_config: codellama/CodeLlama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: CodeLlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./qlora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 100000
-sample_packing: true
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 3
-optimizer: paged_adamw_32bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/code-llama/README.md
+++ b/examples/code-llama/README.md
@@ -1,22 +0,0 @@
-# Overview
-
-This is an example of CodeLLaMA configuration for 7b, 13b and 34b.
-
-The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
-
-The 13b variant will fit if you change these settings to these values:
-gradient_accumulation_steps: 2
-micro_batch_size: 1
-
-The 34b variant does not fit on 24GB of VRAM - you will need something with +40 gb VRAM that also supports flash attention v2 - A6000 or A100 are good choices.
-
-```shell
-accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
-
-```
-or
-
-```shell
-accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
-
-```
--- a/examples/gptq-lora-7b/README.md
+++ b/examples/gptq-lora-7b/README.md
@@ -0,0 +1,8 @@
+# LLaMa 7B using LoRA
+
+This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
+
+```shell
+accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
+
+```
--- a/examples/gptq-lora-7b/config.yml
+++ b/examples/gptq-lora-7b/config.yml
@@ -0,0 +1,63 @@
+base_model: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
+base_model_config: Neko-Institute-of-Science/LLaMA-7B-4bit-128g
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+trust_remote_code:
+load_in_8bit: true
+gptq: true
+datasets:
+  - path: vicgalle/alpaca-gpt4
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.02
+adapter:
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len:
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+lora_fan_in_fan_out: false
+wandb_project: llama-7b-lora-int4
+wandb_entity:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./llama-7b-lora-int4
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0000002
+train_on_inputs: false
+group_by_length: false
+fp16: true
+bf16: false
+tf32: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 5
+xformers_attention:
+flash_attention:
+gradient_checkpointing: true
+gptq_groupsize: 128
+gptq_model_v1: false
+warmup_steps: 20
+eval_steps: 110
+save_steps: 660
+debug:
+deepspeed:
+weight_decay: 0.0001
+fsdp:
+fsdp_config:
+tokens:
+  pad_token: "[PAD]"
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -1,76 +0,0 @@
-base_model: TheBloke/Llama-2-7B-GPTQ
-base_model_config: TheBloke/Llama-2-7B-GPTQ
-is_llama_derived_model: false
-gptq: true
-gptq_bits: 4
-model_type: AutoModelForCausalLM
-tokenizer_type: LlamaTokenizer
-tokenizer_use_fast: true
-tokenizer_legacy: true
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-push_dataset_to_hub:
-hf_use_auth_token: true
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-adapter: lora
-lora_model_dir:
-sequence_len: 4096
-sample_packing:
-lora_r: 8
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - k_proj
-  - o_proj
-  - q_proj
-  - v_proj
-lora_target_linear:
-lora_fan_in_fan_out:
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./model-out
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 3
-optimizer: adamw_torch
-adam_beta2: 0.95
-adam_eps: 0.00001
-max_grad_norm: 1.0
-torchdistx_path:
-lr_scheduler: cosine
-lr_quadratic_warmup: true
-learning_rate: 0.000017
-train_on_inputs: false
-group_by_length: false
-bf16: false
-fp16: false
-float16: true
-tf32: true
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention:
-sdp_attention:
-flash_optimum:
-gptq_groupsize:
-gptq_model_v1:
-warmup_steps: 100
-eval_steps:
-save_steps:
-debug:
-deepspeed:
-weight_decay: 0.1
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -1,73 +0,0 @@
-base_model: meta-llama/Llama-2-7b-hf
-base_model_config: meta-llama/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-datasets:
-  - path: teknium/GPT4-LLM-Cleaned
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./relora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 4096
-sample_packing: true
-
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-relora_steps: 150
-relora_warmup_steps: 10
-relora_cpu_offload: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 4
-num_epochs: 3
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16: false
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 10
-eval_steps: 20
-save_steps: 50
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -47,3 +47,4 @@ local_rank:
 gradient_checkpointing: true
 fsdp:
 fsdp_config:
+collator_pad_to_longest: true
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,26 +1,20 @@
--extra-index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-torch==2.0.1
-auto-gptq
-packaging
 peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
 addict
 fire
-PyYAML>=6.0
+PyYAML==6.0
 datasets
-flash-attn>=2.0.8
+accelerate>=0.19.0
 sentencepiece
 wandb
 einops
 xformers
 optimum
 hf_transfer
-colorama
 numba
-numpy>=1.24.4
+numpy==1.24.4
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
@@ -28,4 +22,3 @@ rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
 pynvml
-art
--- a/scripts/alpaca_json_to_jsonl.py
+++ b/scripts/alpaca_json_to_jsonl.py
@@ -0,0 +1,52 @@
+"""Module to convert json file to jsonl"""
+
+import os
+import sys
+from pathlib import Path
+from typing import Optional, Union
+
+import fire
+
+from axolotl.convert import (
+    FileReader,
+    FileWriter,
+    JsonlSerializer,
+    JsonParser,
+    JsonToJsonlConverter,
+    StdoutWriter,
+)
+from axolotl.logging_config import configure_logging
+
+configure_logging()
+
+# add src to the pythonpath so we don't need to pip install this
+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+src_dir = os.path.join(project_root, "src")
+sys.path.insert(0, src_dir)
+
+
+def main(
+    file: Path,
+    output: Optional[Path] = None,
+    to_stdout: Optional[bool] = False,
+):
+    """
+    Convert a json file to jsonl
+    """
+
+    file_reader = FileReader()
+    writer: Union[StdoutWriter, FileWriter]
+    if to_stdout or output is None:
+        writer = StdoutWriter()
+    else:
+        writer = FileWriter(output)
+    json_parser = JsonParser()
+    jsonl_serializer = JsonlSerializer()
+
+    converter = JsonToJsonlConverter(file_reader, writer, json_parser, jsonl_serializer)
+
+    converter.convert(file, output)
+
+
+if __name__ == "__main__":
+    fire.Fire(main)
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -6,17 +6,14 @@ import os
 import random
 import signal
 import sys
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

 import fire
 import torch
-import transformers
 import yaml

 # add src to the pythonpath so we don't need to pip install this
-from art import text2art
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

@@ -25,7 +22,7 @@ from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
-from axolotl.utils.models import load_model, load_model_config, load_tokenizer
+from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.wandb import setup_wandb_env_vars
@@ -40,26 +37,16 @@ LOG = logging.getLogger("axolotl.scripts")
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


-@dataclass
-class TrainerCliArgs:
-    """
-    dataclass representing the various non-training arguments
-    """
+def print_axolotl_text_art():
+    ascii_art = """
+                           dP            dP   dP
+                           88            88   88
+.d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88
+88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88
+88.  .88  .d88b.  88.  .88 88 88.  .88   88   88
+`88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP
+"""

-    debug: bool = field(default=False)
-    inference: bool = field(default=False)
-    merge_lora: bool = field(default=False)
-    prepare_ds_only: bool = field(default=False)
-    prompter: Optional[str] = field(default=None)
-    shard: bool = field(default=False)
-
-
-def print_axolotl_text_art(suffix=None):
-    font = "nancyj"
-    ascii_text = "  axolotl"
-    if suffix:
-        ascii_text += f"  x  {suffix}"
-    ascii_art = text2art(" axolotl", font=font)
    if is_main_process():
        print(ascii_art)

@@ -74,8 +61,6 @@ def get_multi_line_input() -> Optional[str]:


 def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
-    if prompter == "None":
-        prompter = None
    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

    for token, symbol in default_tokens.items():
@@ -97,8 +82,6 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
        )

-    model = model.to(cfg.device)
-
    while True:
        print("=" * 80)
        # support for multiline inputs
@@ -150,10 +133,6 @@ def choose_config(path: Path):
            "No YAML config files found in the specified directory. Are you using a .yml extension?"
        )

-    if len(yaml_files) == 1:
-        print(f"Using default YAML file '{yaml_files[0]}'")
-        return yaml_files[0]
-
    print("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
        print(f"{idx + 1}. {file}")
@@ -177,20 +156,45 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b


 def train(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
+    config: Path = Path("configs/"),
+    prepare_ds_only: bool = False,
+    **kwargs,
 ):
+    print_axolotl_text_art()
+    if Path(config).is_dir():
+        config = choose_config(config)
+
+    # load the config from the yaml file
+    with open(config, encoding="utf-8") as file:
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
+    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
+    # then overwrite the value
+    cfg_keys = cfg.keys()
+    for k, _ in kwargs.items():
+        # if not strict, allow writing to cfg even if it's not in the yml already
+        if k in cfg_keys or not cfg.strict:
+            # handle booleans
+            if isinstance(cfg[k], bool):
+                cfg[k] = bool(kwargs[k])
+            else:
+                cfg[k] = kwargs[k]
+
+    validate_config(cfg)
+
+    normalize_config(cfg)
+
+    setup_wandb_env_vars(cfg)
+
    # load the tokenizer first
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

-    if not (
-        cli_args.shard or cli_args.merge_lora or cli_args.inference
+    if (
+        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
    ):  # don't need to load dataset for these
        train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)

-    if cli_args.debug or cfg.debug:
+    if cfg.debug or "debug" in kwargs:
        LOG.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
@@ -199,17 +203,17 @@ def train(
            tokenizer,
        )

-    if cli_args.prepare_ds_only:
+    if prepare_ds_only:
        LOG.info("Finished preparing dataset. Exiting...")
        return

    # Load the model and tokenizer
    LOG.info("loading model and (optionally) peft_config...")
-    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
+    model, peft_config = load_model(cfg, tokenizer)

    safe_serialization = cfg.save_safetensors is True

-    if cli_args.merge_lora and cfg.adapter is not None:
+    if "merge_lora" in kwargs and cfg.adapter is not None:
        LOG.info("running merge of LoRA with base model")
        model = model.merge_and_unload()
        model.to(dtype=torch.float16)
@@ -223,31 +227,21 @@ def train(
            tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

-    if cli_args.inference:
-        LOG.debug("Running inference on model")
-        do_inference(cfg, model, tokenizer, prompter=cli_args.prompter)
+    if cfg.inference:
+        LOG.info("calling do_inference function")
+        prompter: Optional[str] = "AlpacaPrompter"
+        if "prompter" in kwargs:
+            if kwargs["prompter"] == "None":
+                prompter = None
+            else:
+                prompter = kwargs["prompter"]
+        do_inference(cfg, model, tokenizer, prompter=prompter)
        return

-    if cli_args.shard:
-        LOG.debug("Re-saving model w/ sharding")
+    if "shard" in kwargs:
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
        return

-    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
-        possible_checkpoints = [
-            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
-        ]
-        if len(possible_checkpoints) > 0:
-            sorted_paths = sorted(
-                possible_checkpoints,
-                key=lambda path: int(path.split("-")[-1]),
-            )
-            cfg.resume_from_checkpoint = sorted_paths[-1]
-            LOG.info(
-                f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
-            )
-    resume_from_checkpoint = cfg.resume_from_checkpoint
-
    trainer = setup_trainer(
        cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
    )
@@ -279,6 +273,20 @@ def train(
    LOG.info("Starting trainer...")
    if cfg.group_by_length:
        LOG.info("hang tight... sorting dataset for group_by_length")
+    resume_from_checkpoint = cfg.resume_from_checkpoint
+    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
+        possible_checkpoints = [
+            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
+        ]
+        if len(possible_checkpoints) > 0:
+            sorted_paths = sorted(
+                possible_checkpoints,
+                key=lambda path: int(path.split("-")[-1]),
+            )
+            resume_from_checkpoint = sorted_paths[-1]
+            LOG.info(
+                f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
+            )

    if not Path(cfg.output_dir).is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)
@@ -293,13 +301,6 @@ def train(

    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

-    if cfg.relora_steps:
-        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
-            model = model.merge_and_unload()
-        else:
-            # final model weights have already been saved by `ReLoRACallback.on_train_end`
-            return
-
    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
    if cfg.fsdp:
@@ -307,55 +308,8 @@ def train(
    elif cfg.local_rank == 0:
        if cfg.flash_optimum:
            model = BetterTransformer.reverse(model)
-
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)


-def load_cfg(config: Path = Path("examples/"), **kwargs):
-    if Path(config).is_dir():
-        config = choose_config(config)
-
-    # load the config from the yaml file
-    with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.safe_load(file))
-    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
-    # then overwrite the value
-    cfg_keys = cfg.keys()
-    for k, _ in kwargs.items():
-        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or not cfg.strict:
-            # handle booleans
-            if isinstance(cfg[k], bool):
-                cfg[k] = bool(kwargs[k])
-            else:
-                cfg[k] = kwargs[k]
-
-    model_config = load_model_config(cfg)
-
-    # figure out if the model is llama
-    cfg.is_llama_derived_model = (
-        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
-        or cfg.is_llama_derived_model
-        or "llama" in cfg.base_model
-        or (cfg.model_type and "llama" in cfg.model_type.lower())
-    )
-    validate_config(cfg)
-
-    normalize_config(cfg)
-
-    setup_wandb_env_vars(cfg)
-    return cfg
-
-
-def do_train(config: Path = Path("examples/"), **kwargs):
-    print_axolotl_text_art()
-    parsed_cfg = load_cfg(config, **kwargs)
-    parser = transformers.HfArgumentParser((TrainerCliArgs))
-    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
-        return_remaining_strings=True
-    )
-    train(cfg=parsed_cfg, cli_args=parsed_cli_args)
-
-
 if __name__ == "__main__":
-    fire.Fire(do_train)
+    fire.Fire(train)
--- a/setup.py
+++ b/setup.py
@@ -2,27 +2,14 @@

 from setuptools import find_packages, setup

-
-def parse_requirements():
-    _install_requires = []
-    _dependency_links = []
-    with open("./requirements.txt", encoding="utf-8") as requirements_file:
-        lines = [
-            r.strip() for r in requirements_file.readlines() if "auto-gptq" not in r
-        ]
-        for line in lines:
-            if line.startswith("--extra-index-url"):
-                # Handle custom index URLs
-                _, url = line.split()
-                _dependency_links.append(url)
-            elif "flash-attn" not in line and line and line[0] != "#":
-                # Handle standard packages
-                _install_requires.append(line)
-    return _install_requires, _dependency_links
-
-
-install_requires, dependency_links = parse_requirements()
-
+install_requires = []
+with open("./requirements.txt", encoding="utf-8") as requirements_file:
+    # don't include peft yet until we check the int4
+    # need to manually install peft for now...
+    reqs = [r.strip() for r in requirements_file.readlines() if "peft" not in r]
+    reqs = [r for r in reqs if r and r[0] != "#"]
+    for r in reqs:
+        install_requires.append(r)

 setup(
    name="axolotl",
@@ -31,15 +18,15 @@ setup(
    package_dir={"": "src"},
    packages=find_packages(),
    install_requires=install_requires,
-    dependency_links=dependency_links,
    extras_require={
        "gptq": [
-            "auto-gptq",
+            "alpaca_lora_4bit @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
        ],
-        "flash-attn": [
-            "flash-attn==2.0.8",
+        "gptq_triton": [
+            "alpaca_lora_4bit[triton] @ git+https://github.com/winglian/alpaca_lora_4bit.git@setup_pip",
        ],
        "extras": [
+            "flash-attn",
            "deepspeed",
        ],
    },
--- a/src/axolotl/logging_config.py
+++ b/src/axolotl/logging_config.py
@@ -1,42 +1,16 @@
-"""
-Common logging module for axolotl
-"""
+"""Logging configuration settings"""

 import os
 import sys
-from logging import Formatter
 from logging.config import dictConfig
 from typing import Any, Dict

-from colorama import Fore, Style, init
-
-
-class ColorfulFormatter(Formatter):
-    """
-    Formatter to add coloring to log messages by log type
-    """
-
-    COLORS = {
-        "WARNING": Fore.YELLOW,
-        "ERROR": Fore.RED,
-        "CRITICAL": Fore.RED + Style.BRIGHT,
-    }
-
-    def format(self, record):
-        log_message = super().format(record)
-        return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET
-
-
 DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
    "formatters": {
        "simple": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
        },
-        "colorful": {
-            "()": ColorfulFormatter,
-            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
-        },
    },
    "filters": {},
    "handlers": {
@@ -46,25 +20,14 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
            "filters": [],
            "stream": sys.stdout,
        },
-        "color_console": {
-            "class": "logging.StreamHandler",
-            "formatter": "colorful",
-            "filters": [],
-            "stream": sys.stdout,
-        },
    },
    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
    "loggers": {
-        "axolotl": {
-            "handlers": ["color_console"],
-            "level": "DEBUG",
-            "propagate": False,
-        },
+        "axolotl": {"handlers": ["console"], "level": "DEBUG", "propagate": False},
    },
 }


 def configure_logging():
    """Configure with default logging"""
-    init()  # Initialize colorama
    dictConfig(DEFAULT_LOGGING_CONFIG)
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -2,47 +2,142 @@

 # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py

-import warnings
-from typing import List, Optional, Tuple, Union
+from typing import Optional, Tuple

 import torch
-import torch.nn.functional as F
 import transformers
 from einops import rearrange
 from flash_attn.bert_padding import pad_input, unpad_input
-from transformers.modeling_outputs import BaseModelOutputWithPast
-from transformers.models.llama.modeling_llama import (
-    LlamaDecoderLayer as OriginalLlamaDecoderLayer,
-)
-from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
-
-from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

 try:
-    from flash_attn.flash_attn_interface import (  # pylint: disable=ungrouped-imports
-        flash_attn_kvpacked_func,
-        flash_attn_varlen_kvpacked_func,
-        flash_attn_varlen_qkvpacked_func,
-    )
+    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
 except ImportError:
-    from flash_attn.flash_attn_interface import (
-        flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func,
-    )
    from flash_attn.flash_attn_interface import (
        flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
    )

+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

-def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
-    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
-        _prepare_decoder_attention_mask
+from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
+
+
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Input shape: Batch x Time x Channel
+
+    attention_mask: [bsz, q_len]
+    """
+    # pylint: disable=duplicate-code
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
    )
-    transformers.models.llama.modeling_llama.LlamaAttention.forward = flashattn_forward
-    if packed:
-        transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
-        transformers.models.llama.modeling_llama.LlamaModel.forward = (
-            llama_model_forward
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # [bsz, q_len, nh, hd]
+    # [bsz, nh, q_len, hd]
+
+    kv_seq_len = key_states.shape[-2]
+    assert past_key_value is None, "past_key_value is not supported"
+
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+    assert not output_attentions, "output_attentions is not supported"
+    assert not use_cache, "use_cache is not supported"
+
+    # Flash attention codes from
+    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+
+    # transform the data into the format required by flash attention
+    qkv = torch.stack(
+        [query_states, key_states, value_states], dim=2
+    )  # [bsz, nh, 3, q_len, hd]
+    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+    # We have disabled _prepare_decoder_attention_mask in LlamaModel
+    # the attention_mask should be the same as the key_padding_mask
+    key_padding_mask = attention_mask
+
+    if key_padding_mask is None:
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        max_s = q_len
+        cu_q_lens = torch.arange(
+            0,
+            (bsz + 1) * q_len,
+            step=q_len,
+            dtype=torch.int32,
+            device=qkv.device,
        )
+        output = flash_attn_varlen_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    elif attention_mask.shape[0] == 1:
+        # special handling using sample packing
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        cu_q_lens, max_s = get_cu_seqlens_from_pos_ids(position_ids)
+        cu_q_lens = cu_q_lens.squeeze()
+
+        output = flash_attn_varlen_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    else:
+        nheads = qkv.shape[-2]
+
+        # pylint: disable=invalid-name
+        x = rearrange(qkv, "b s three h d -> b s (three h d)")
+        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+        x_unpad = rearrange(
+            x_unpad,
+            "nnz (three h d) -> nnz three h d",
+            three=3,
+            h=nheads,
+        )
+        output_unpad = flash_attn_varlen_qkvpacked_func(
+            x_unpad,
+            cu_q_lens,
+            max_s,
+            0.0,
+            softmax_scale=None,
+            causal=True,
+        )
+        output = rearrange(
+            pad_input(
+                rearrange(output_unpad, "nnz h d -> nnz (h d)"),
+                indices,
+                bsz,
+                q_len,
+            ),
+            "b s (h d) -> b s h d",
+            h=nheads,
+        )
+
+    return (
+        self.o_proj(rearrange(output, "b s h d -> b s (h d)")),
+        None,
+        None,
+    )


 # Disable the transformation of the attention mask in LlamaModel as the flash attention
@@ -58,541 +153,8 @@ def _prepare_decoder_attention_mask(
    return attention_mask


-def flashattn_forward(
-    self,
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.Tensor] = None,
-    past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    output_attentions: bool = False,
-    use_cache: bool = False,
-    cu_seqlens: Optional[torch.Tensor] = None,
-    max_seqlen: Optional[torch.Tensor] = None,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-    """Input shape: Batch x Time x Channel
-
-    attention_mask: [bsz, q_len]
-    """
-    # pylint: disable=duplicate-code
-    bsz, q_len, _ = hidden_states.size()
-
-    if not hasattr(self, "pretraining_tp"):
-        self.pretraining_tp = 1
-
-    if self.pretraining_tp > 1:
-        key_value_slicing = (
-            self.num_key_value_heads * self.head_dim
-        ) // self.pretraining_tp
-        query_slices = self.q_proj.weight.split(
-            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
-        )
-        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-        query_states = [
-            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        query_states = torch.cat(query_states, dim=-1)
-
-        key_states = [
-            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        key_states = torch.cat(key_states, dim=-1)
-
-        value_states = [
-            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        value_states = torch.cat(value_states, dim=-1)
-
-    else:
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-    query_states = query_states.view(
-        bsz, q_len, self.num_heads, self.head_dim
-    ).transpose(1, 2)
-    key_states = key_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    value_states = value_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    # [bsz, q_len, nh, hd]
-    # [bsz, nh, q_len, hd]
-
-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-    query_states, key_states = apply_rotary_pos_emb(
-        query_states, key_states, cos, sin, position_ids
+def replace_llama_attn_with_flash_attn():
+    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
+        _prepare_decoder_attention_mask
    )
-    # [bsz, nh, t, hd]
-
-    if past_key_value is not None:
-        # reuse k, v, self_attention
-        key_states = torch.cat([past_key_value[0], key_states], dim=2)
-        value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-    past_key_value = (key_states, value_states) if use_cache else None
-
-    # repeat k/v heads if n_kv_heads < n_heads
-    key_states = repeat_kv(key_states, self.num_key_value_groups)
-    value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-    if output_attentions:
-        warnings.warn(
-            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
-        )
-
-    #
-    # flash-attn v2 start
-    #
-
-    if self.training:
-        # during training q,k,v always have same seqlen
-        assert key_states.shape == query_states.shape
-        is_causal = True
-    else:
-        # turn off FA causal mask after first inference autoregressive iteration
-        # only on first autoregressive step q,k,v have same seqlen
-        is_causal = key_states.shape == query_states.shape
-
-    if cu_seqlens is not None and max_seqlen is not None:
-        # special handling using sample packing
-        qkv = torch.stack(
-            [query_states, key_states, value_states], dim=2
-        )  # [bsz, nh, 3, q_len, hd]
-        qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
-        qkv = rearrange(qkv, "b s ... -> (b s) ...")
-
-        output = flash_attn_varlen_qkvpacked_func(
-            qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=None, causal=True
-        )
-        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
-    elif query_states.shape == key_states.shape:
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-        qkv_unpad, cu_seqlens_q, max_seqlen_q, _, output_pad_fn = generate_qkv(
-            query_states,
-            key_states,
-            value_states,
-            qkvpacked=True,
-            # We have disabled _prepare_decoder_attention_mask in LlamaModel
-            # the attention_mask should be the same as the key_padding_mask
-            key_padding_mask=attention_mask,
-            query_padding_mask=attention_mask[:, -query_states.size(1) :]
-            if attention_mask is not None
-            else None,
-        )
-        output_unpad = flash_attn_varlen_qkvpacked_func(
-            qkv_unpad,
-            cu_seqlens_q,
-            max_seqlen_q,
-            0.0,
-            softmax_scale=None,
-            causal=is_causal,
-        )
-        output = output_pad_fn(output_unpad)
-    else:
-        query_states = query_states.transpose(1, 2)
-        key_states = key_states.transpose(1, 2)
-        value_states = value_states.transpose(1, 2)
-        if attention_mask is None or attention_mask.all().item():
-            output = flash_attn_kvpacked_func(
-                query_states,
-                torch.stack([key_states, value_states], 2),
-                causal=is_causal,
-            )
-        else:
-            (  # pylint: disable=unbalanced-tuple-unpacking
-                q_unpad,
-                kv_unpad,
-                cu_seqlens_q,
-                cu_seqlens_k,
-                max_seqlen_q,
-                max_seqlen_k,
-                _,
-                _,
-                output_pad_fn,
-            ) = generate_qkv(
-                query_states,
-                key_states,
-                value_states,
-                kvpacked=True,
-                key_padding_mask=attention_mask,
-                query_padding_mask=attention_mask[:, -query_states.size(1) :]
-                if attention_mask is not None
-                else None,
-            )
-            output_unpad = flash_attn_varlen_kvpacked_func(
-                q_unpad,
-                kv_unpad,
-                cu_seqlens_q,
-                cu_seqlens_k,
-                max_seqlen_q,
-                max_seqlen_k,
-                0.0,
-                softmax_scale=None,
-                causal=is_causal,
-            )
-            output = output_pad_fn(output_unpad)
-
-    attn_output = output
-    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
-        raise ValueError(
-            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
-            f" {attn_output.size()}"
-        )
-    attn_output = rearrange(attn_output, "b s h d -> b s (h d)")
-
-    #
-    # flash-attn v2 end
-    #
-
-    if self.pretraining_tp > 1:
-        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
-        o_proj_slices = self.o_proj.weight.split(
-            self.hidden_size // self.pretraining_tp, dim=1
-        )
-        attn_output = sum(
-            F.linear(attn_output[i], o_proj_slices[i])
-            for i in range(self.pretraining_tp)
-        )
-    else:
-        attn_output = self.o_proj(attn_output)
-
-    return attn_output, None, past_key_value
-
-
-# based on https://github.com/Dao-AILab/flash-attention/blob/364a5b/tests/test_flash_attn.py#L38
-def generate_qkv(
-    q,
-    k,
-    v,
-    query_padding_mask=None,
-    key_padding_mask=None,
-    kvpacked=False,
-    qkvpacked=False,
-):  # pylint: disable=invalid-name,unnecessary-lambda-assignment
-    """
-    Arguments:
-        q: (batch_size, seqlen_q, nheads, d)
-        k: (batch_size, seqlen_k, nheads_k, d)
-        v: (batch_size, seqlen_k, nheads_k, d)
-        query_padding_mask: (batch_size, seqlen), bool
-        key_padding_mask: (batch_size, seqlen), bool
-    """
-    assert not (kvpacked and qkvpacked)
-    batch_size, seqlen_q, nheads, d = q.shape
-    _, seqlen_k, nheads_k, _ = k.shape
-    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
-    assert v.shape == (batch_size, seqlen_k, nheads_k, d)
-
-    if query_padding_mask is not None:
-        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(
-            q, query_padding_mask
-        )
-
-        output_pad_fn = lambda output_unpad: pad_input(  # noqa: E731
-            output_unpad, indices_q, batch_size, seqlen_q
-        )
-
-    else:
-        q_unpad = rearrange(q, "b s h d -> (b s) h d")
-        cu_seqlens_q = torch.arange(
-            0,
-            (batch_size + 1) * seqlen_q,
-            step=seqlen_q,
-            dtype=torch.int32,
-            device=q_unpad.device,
-        )
-        max_seqlen_q = seqlen_q
-
-        output_pad_fn = lambda output_unpad: rearrange(  # noqa: E731
-            output_unpad, "(b s) h d -> b s h d", b=batch_size
-        )
-
-    if key_padding_mask is not None:
-        k_unpad, _, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask)
-        v_unpad, _, _, _ = unpad_input(v, key_padding_mask)
-    else:
-        k_unpad = rearrange(k, "b s h d -> (b s) h d")
-        v_unpad = rearrange(v, "b s h d -> (b s) h d")
-        cu_seqlens_k = torch.arange(
-            0,
-            (batch_size + 1) * seqlen_k,
-            step=seqlen_k,
-            dtype=torch.int32,
-            device=k_unpad.device,
-        )
-        max_seqlen_k = seqlen_k
-
-    if qkvpacked:
-        assert nheads == nheads_k
-        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
-        qkv = torch.stack([q, k, v], dim=2)
-        return (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv, output_pad_fn)
-
-    if kvpacked:
-        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
-        kv = torch.stack([k, v], dim=2)
-        return (
-            q_unpad,
-            kv_unpad,
-            cu_seqlens_q,
-            cu_seqlens_k,
-            max_seqlen_q,
-            max_seqlen_k,
-            q,
-            kv,
-            output_pad_fn,
-        )
-
-    return (
-        q_unpad,
-        k_unpad,
-        v_unpad,
-        cu_seqlens_q,
-        cu_seqlens_k,
-        max_seqlen_q,
-        max_seqlen_k,
-        q,
-        k,
-        v,
-        output_pad_fn,
-    )
-
-
-def llama_model_forward(
-    self,
-    input_ids: torch.LongTensor = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-) -> Union[Tuple, BaseModelOutputWithPast]:
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # retrieve input_ids and inputs_embeds
-    if input_ids is not None and inputs_embeds is not None:
-        raise ValueError(
-            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
-        )
-    if input_ids is not None:
-        batch_size, seq_length = input_ids.shape
-    elif inputs_embeds is not None:
-        batch_size, seq_length, _ = inputs_embeds.shape
-    else:
-        raise ValueError(
-            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
-        )
-
-    seq_length_with_past = seq_length
-    past_key_values_length = 0
-
-    if past_key_values is not None:
-        past_key_values_length = past_key_values[0][0].shape[2]
-        seq_length_with_past = seq_length_with_past + past_key_values_length
-
-    cu_seqlens = None
-    max_seqlen = None
-    if position_ids is None:
-        device = input_ids.device if input_ids is not None else inputs_embeds.device
-        position_ids = torch.arange(
-            past_key_values_length,
-            seq_length + past_key_values_length,
-            dtype=torch.long,
-            device=device,
-        )
-        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
-    else:
-        position_ids = position_ids.view(-1, seq_length).long()
-        cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
-        cu_seqlens = cu_seqlens.squeeze()
-
-    if inputs_embeds is None:
-        inputs_embeds = self.embed_tokens(input_ids)
-    # embed positions
-    if attention_mask is None:
-        attention_mask = torch.ones(
-            (batch_size, seq_length_with_past),
-            dtype=torch.bool,
-            device=inputs_embeds.device,
-        )
-    attention_mask = (
-        self._prepare_decoder_attention_mask(  # pylint: disable=protected-access
-            attention_mask,
-            (batch_size, seq_length),
-            inputs_embeds,
-            past_key_values_length,
-        )
-    )
-
-    hidden_states = inputs_embeds
-
-    if self.gradient_checkpointing and self.training:
-        if use_cache:
-            transformers.logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
-            use_cache = False
-
-    # decoder layers
-    all_hidden_states = () if output_hidden_states else None
-    all_self_attns = () if output_attentions else None
-    next_decoder_cache = () if use_cache else None
-
-    for idx, decoder_layer in enumerate(self.layers):
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        past_key_value = past_key_values[idx] if past_key_values is not None else None
-
-        if self.gradient_checkpointing and self.training:
-
-            def create_custom_forward(module):
-                def custom_forward(*inputs):
-                    # None for past_key_value
-                    return module(*inputs)
-
-                return custom_forward
-
-            layer_outputs = torch.utils.checkpoint.checkpoint(
-                create_custom_forward(decoder_layer),
-                hidden_states,
-                attention_mask,
-                position_ids,
-                None,
-                output_attentions,
-                None,
-                cu_seqlens,
-                max_seqlen,
-            )
-        else:
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=attention_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cu_seqlens=cu_seqlens,
-                max_seqlen=max_seqlen,
-            )
-
-        hidden_states = layer_outputs[0]
-
-        if use_cache:
-            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
-
-        if output_attentions:
-            all_self_attns += (layer_outputs[1],)
-
-    hidden_states = self.norm(hidden_states)
-
-    # add hidden states from the last decoder layer
-    if output_hidden_states:
-        all_hidden_states += (hidden_states,)
-
-    next_cache = next_decoder_cache if use_cache else None
-    if not return_dict:
-        return tuple(
-            v
-            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
-            if v is not None
-        )
-    return BaseModelOutputWithPast(
-        last_hidden_state=hidden_states,
-        past_key_values=next_cache,
-        hidden_states=all_hidden_states,
-        attentions=all_self_attns,
-    )
-
-
-class LlamaDecoderLayer(OriginalLlamaDecoderLayer):
-    """
-    patched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens
-    """
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        cu_seqlens: Optional[torch.Tensor] = None,
-        max_seqlen: Optional[torch.Tensor] = None,
-    ) -> Tuple[
-        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
-    ]:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-            cu_seqlens (`torch.Tensor`, *optional*) cumulative sequence len when packing
-        """
-
-        residual = hidden_states
-
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            cu_seqlens=cu_seqlens,
-            max_seqlen=max_seqlen,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        if use_cache:
-            outputs += (present_key_value,)
-
-        return outputs
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
--- a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
@@ -1,140 +0,0 @@
-"""
-Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
-"""
-
-import warnings
-from typing import Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-import transformers.models.llama.modeling_llama
-from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
-
-
-def hijack_llama_sdp_attention():
-    transformers.models.llama.modeling_llama.LlamaAttention.forward = (
-        sdp_attention_forward
-    )
-
-
-def sdp_attention_forward(
-    self,
-    hidden_states: torch.Tensor,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_value: Optional[Tuple[torch.Tensor]] = None,
-    output_attentions: bool = False,
-    use_cache: bool = False,
-) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-    # pylint: disable=duplicate-code
-    bsz, q_len, _ = hidden_states.size()
-
-    if not hasattr(self, "pretraining_tp"):
-        self.pretraining_tp = 1
-
-    if self.pretraining_tp > 1:
-        key_value_slicing = (
-            self.num_key_value_heads * self.head_dim
-        ) // self.pretraining_tp
-        query_slices = self.q_proj.weight.split(
-            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
-        )
-        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
-        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
-
-        query_states = [
-            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        query_states = torch.cat(query_states, dim=-1)
-
-        key_states = [
-            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        key_states = torch.cat(key_states, dim=-1)
-
-        value_states = [
-            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
-        ]
-        value_states = torch.cat(value_states, dim=-1)
-
-    else:
-        query_states = self.q_proj(hidden_states)
-        key_states = self.k_proj(hidden_states)
-        value_states = self.v_proj(hidden_states)
-
-    query_states = query_states.view(
-        bsz, q_len, self.num_heads, self.head_dim
-    ).transpose(1, 2)
-    key_states = key_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    value_states = value_states.view(
-        bsz, q_len, self.num_key_value_heads, self.head_dim
-    ).transpose(1, 2)
-    # [bsz, q_len, nh, hd]
-    # [bsz, nh, q_len, hd]
-
-    kv_seq_len = key_states.shape[-2]
-    if past_key_value is not None:
-        kv_seq_len += past_key_value[0].shape[-2]
-
-    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-    query_states, key_states = apply_rotary_pos_emb(
-        query_states, key_states, cos, sin, position_ids
-    )
-    # [bsz, nh, t, hd]
-
-    if past_key_value is not None:
-        # reuse k, v, self_attention
-        key_states = torch.cat([past_key_value[0], key_states], dim=2)
-        value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-    past_key_value = (key_states, value_states) if use_cache else None
-
-    # repeat k/v heads if n_kv_heads < n_heads
-    key_states = repeat_kv(key_states, self.num_key_value_groups)
-    value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-    if output_attentions:
-        warnings.warn(
-            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
-        )
-
-    #
-    # sdp-attn start
-    #
-
-    with torch.backends.cuda.sdp_kernel():
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            is_causal=False,
-        )
-
-    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
-        raise ValueError(
-            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
-            f" {attn_output.size()}"
-        )
-    attn_output = attn_output.transpose(1, 2)
-    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
-
-    #
-    # sdp-attn end
-    #
-
-    if self.pretraining_tp > 1:
-        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
-        o_proj_slices = self.o_proj.weight.split(
-            self.hidden_size // self.pretraining_tp, dim=1
-        )
-        attn_output = sum(
-            F.linear(attn_output[i], o_proj_slices[i])
-            for i in range(self.pretraining_tp)
-        )
-    else:
-        attn_output = self.o_proj(attn_output)
-
-    return attn_output, None, past_key_value
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -3,13 +3,13 @@ Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-g
 """

 import logging
-import warnings
+import math
 from typing import Optional, Tuple

 import torch
 import torch.nn.functional as F
 import transformers.models.llama.modeling_llama
-from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
+from torch import nn

 try:
    import xformers.ops
@@ -21,6 +21,12 @@ def hijack_llama_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward


+def hijack_llama_sdp_attention():
+    """Monkeypatch `LlamaAttention.forward` so it runs `sdp_attention_forward`."""
+    llama_attention_cls = transformers.models.llama.modeling_llama.LlamaAttention
+    llama_attention_cls.forward = sdp_attention_forward
+
+
 def xformers_forward(
    self,
    hidden_states: torch.Tensor,
@@ -75,15 +81,15 @@ def xformers_forward(
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
-    # [bsz, q_len, nh, hd]
-    # [bsz, nh, q_len, hd]

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
-
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-    query_states, key_states = apply_rotary_pos_emb(
+    (
+        query_states,
+        key_states,
+    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]
@@ -96,50 +102,74 @@ def xformers_forward(
    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
-    key_states = repeat_kv(key_states, self.num_key_value_groups)
-    value_states = repeat_kv(value_states, self.num_key_value_groups)
+    key_states = transformers.models.llama.modeling_llama.repeat_kv(
+        key_states, self.num_key_value_groups
+    )
+    value_states = transformers.models.llama.modeling_llama.repeat_kv(
+        value_states, self.num_key_value_groups
+    )

-    if output_attentions:
-        warnings.warn(
-            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
-        )
+    # We only apply xformers optimizations if we don't need to output the whole attention matrix
+    if not output_attentions:
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)

-    #
-    # xformers-attn start
-    #
-
-    query_states = query_states.transpose(1, 2)
-    key_states = key_states.transpose(1, 2)
-    value_states = value_states.transpose(1, 2)
-
-    # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
-    # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
-    if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
-        # input and output should be of form (bsz, q_len, num_heads, head_dim)
-        attn_output = xformers.ops.memory_efficient_attention(
-            query_states, key_states, value_states, attn_bias=None
-        )
+        # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
+        # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
+        if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
+            # input and output should be of form (bsz, q_len, num_heads, head_dim)
+            attn_output = xformers.ops.memory_efficient_attention(
+                query_states, key_states, value_states, attn_bias=None
+            )
+        else:
+            # input and output should be of form (bsz, q_len, num_heads, head_dim)
+            attn_output = xformers.ops.memory_efficient_attention(
+                query_states,
+                key_states,
+                value_states,
+                # attn_bias=attention_mask,
+                attn_bias=xformers.ops.LowerTriangularMask(),
+            )
+        attn_weights = None
    else:
-        # input and output should be of form (bsz, q_len, num_heads, head_dim)
-        attn_output = xformers.ops.memory_efficient_attention(
-            query_states,
-            key_states,
-            value_states,
-            # attn_bias=attention_mask,
-            attn_bias=xformers.ops.LowerTriangularMask(),
-        )
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+            attn_weights = torch.max(
+                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
+            )
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        # end x-formers vs. not x-formers if-else block

-    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
-        raise ValueError(
-            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
-            f" {attn_output.size()}"
-        )
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

-    #
-    # xformers-attn end
-    #
-
    if self.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
@@ -152,4 +182,103 @@ def xformers_forward(
    else:
        attn_output = self.o_proj(attn_output)

-    return attn_output, None, past_key_value
+    return attn_output, attn_weights, past_key_value
+
+
+def sdp_attention_forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """
+    Replacement for `LlamaAttention.forward` built on
+    `torch.nn.functional.scaled_dot_product_attention`.
+
+    The fused kernel is used unless `output_attentions` is set; in that case the
+    attention matrix is computed with an explicit matmul + softmax so that it can
+    be returned to the caller.
+
+    Args:
+        hidden_states: layer input, unpacked as `(bsz, q_len, _)`.
+        attention_mask: optional mask. It is passed as `attn_mask` to the fused
+            kernel, or added to the scores on the explicit path, where it must be
+            of size `(bsz, 1, q_len, kv_seq_len)`.
+        position_ids: forwarded to `apply_rotary_pos_emb`.
+        past_key_value: optional cached `(key, value)` pair; it is concatenated
+            in front of the new key/value states along dim 2.
+        output_attentions: when true, return the attention weights (this skips
+            the fused kernel).
+        use_cache: when true, return the updated `(key, value)` pair.
+
+    Returns:
+        `(attn_output, attn_weights, past_key_value)`. `attn_weights` is `None`
+        on the fused path and `past_key_value` is `None` unless `use_cache`.
+
+    Raises:
+        ValueError: on the explicit path, if the attention weights, the mask or
+            the attention output do not have the expected size.
+    """
+    # pylint: disable=duplicate-code
+    bsz, q_len, _ = hidden_states.size()
+
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # keys/values may have fewer heads than queries (grouped-query attention),
+    # so they are shaped with num_key_value_heads, matching xformers_forward
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+
+    kv_seq_len = key_states.shape[-2]
+    if past_key_value is not None:
+        kv_seq_len += past_key_value[0].shape[-2]
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    (
+        query_states,
+        key_states,
+    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+
+    if past_key_value is not None:
+        # reuse k, v, self_attention
+        key_states = torch.cat([past_key_value[0], key_states], dim=2)
+        value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+    # the cache keeps the un-repeated k/v heads; expansion happens below
+    past_key_value = (key_states, value_states) if use_cache else None
+
+    # repeat k/v heads if n_kv_heads < n_heads
+    key_states = transformers.models.llama.modeling_llama.repeat_kv(
+        key_states, self.num_key_value_groups
+    )
+    value_states = transformers.models.llama.modeling_llama.repeat_kv(
+        value_states, self.num_key_value_groups
+    )
+
+    # We only apply sdp attention if we don't need to output the whole attention matrix
+    if not output_attentions:
+        with torch.backends.cuda.sdp_kernel():
+            # is_causal=False: the kernel adds no causal mask of its own, so any
+            # masking has to come from attention_mask
+            attn_output = torch.nn.functional.scaled_dot_product_attention(
+                query_states,
+                key_states,
+                value_states,
+                attn_mask=attention_mask,
+                is_causal=False,
+            )
+            attn_weights = None
+    else:
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+            # clamp from below so masked scores stay at the dtype's finite minimum
+            attn_weights = torch.max(
+                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
+            )
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+    # [bsz, nh, q_len, hd] -> [bsz, q_len, nh * hd]
+    attn_output = attn_output.transpose(1, 2)
+    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+    attn_output = self.o_proj(attn_output)
+
+    return attn_output, attn_weights, past_key_value
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -1,393 +0,0 @@
-"""Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune."""
-import glob
-import json
-import logging
-import os.path
-import shutil
-from pathlib import Path
-from typing import Dict, List, Sequence
-
-import bitsandbytes as bnb
-import peft
-import safetensors.torch as st
-import torch
-from huggingface_hub import snapshot_download
-from torch.optim.lr_scheduler import LRScheduler
-from torch.optim.optimizer import Optimizer
-from transformers import (
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
-
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process
-
-LOG = logging.getLogger("axolotl.relora")
-
-
-def reset_optimizer(optimizer: torch.optim.Optimizer):
-    for group in optimizer.param_groups:
-        for param in group["params"]:
-            param_state = optimizer.state[param]
-            for key in param_state:
-                if "qmap" in key:
-                    continue
-
-                if key == "step" and isinstance(param_state[key], int):
-                    param_state[key] = 0
-                else:
-                    param_state[key] = torch.zeros_like(param_state[key])
-
-
-class ReLoRACallback(TrainerCallback):
-    """Callback to merge LoRA weights into the base model and save full-weight checkpoints"""
-
-    def __init__(self, cfg: DictDefault):
-        self.relora_steps = cfg.relora_steps
-        self.cpu_offload = cfg.relora_cpu_offload
-        self.quantized = cfg.load_in_4bit or cfg.load_in_8bit
-        self.last_full_model = cfg.base_model
-        self.resume_from_checkpoint = cfg.resume_from_checkpoint
-
-        if not os.path.exists(self.last_full_model):
-            self.last_full_model = str(Path(snapshot_download(cfg.base_model)))
-
-        assert os.path.exists(
-            self.last_full_model
-        ), "for ReLORA base_model must be a local path"
-
-        self.num_lora_restarts = 0
-        self.need_full_save = False
-
-    def on_train_begin(
-        self,
-        _args: TrainingArguments,
-        _state: TrainerState,
-        control: TrainerControl,
-        model: peft.LoraModel,
-        **_kwargs,
-    ):
-        if self.resume_from_checkpoint:
-            weight_path = os.path.join(self.resume_from_checkpoint, "relora")
-            if not os.path.exists(weight_path):
-                LOG.warning(
-                    "Resuming ReLoRA from checkpoint, but no full-weight save found"
-                )
-            else:
-                LOG.info(f"Loading adjusted base weights from {weight_path}")
-                load_weight_checkpoint(model, weight_path)
-        return control
-
-    def on_step_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: peft.LoraModel,
-        optimizer: torch.optim.Optimizer,
-        **_kwargs,
-    ):
-        if state.global_step > 0 and state.global_step % self.relora_steps == 0:
-            checkpoint_folder = os.path.join(
-                args.output_dir,
-                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
-                "relora",
-            )
-
-            with torch.no_grad():
-                merge_and_save(
-                    model,
-                    self.last_full_model,
-                    checkpoint_folder,
-                    reinit=True,
-                    quantized=self.quantized,
-                    actually_save=is_main_process(),
-                    cpu_offload=self.cpu_offload,
-                )
-                reset_optimizer(optimizer)
-
-            if self.quantized:
-                self.last_full_model = checkpoint_folder
-            self.num_lora_restarts += 1
-
-        return control
-
-    def on_save(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        model: peft.LoraModel,
-        **_kwargs,
-    ):
-        checkpoint_folder = os.path.join(
-            args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora"
-        )
-        if (
-            state.global_step >= self.relora_steps
-            and state.global_step % self.relora_steps != 0
-        ):
-            if self.quantized:
-                if is_main_process() and self.last_full_model != checkpoint_folder:
-                    # ensure the latest full parameter save is in the latest checkpoint
-                    # folder, so that automatic pruning of checkpoints does not remove it
-                    LOG.info(f"moving last full parameter save to {checkpoint_folder}")
-                    os.makedirs(checkpoint_folder, exist_ok=True)
-                    chunks = glob.glob(
-                        f"{self.last_full_model}/model*.safetensors"
-                    ) + glob.glob(f"{self.last_full_model}/model*.index.json")
-                    for path in chunks:
-                        new_path = os.path.abspath(shutil.move(path, checkpoint_folder))
-                        try:
-                            os.symlink(new_path, path)
-                        except OSError:
-                            # probably on windows without permission to symlink
-                            pass
-
-                    self.last_full_model = checkpoint_folder
-            else:
-                model.model.save_pretrained(checkpoint_folder, safe_serialization=True)
-
-        return control
-
-    def on_log(
-        self,
-        _args: TrainingArguments,
-        _state: TrainerState,
-        control: TrainerControl,
-        logs: Dict[str, float],
-        **_kwargs,
-    ):
-        logs["num_lora_restarts"] = self.num_lora_restarts
-        return control
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        _state: TrainerState,
-        control: TrainerControl,
-        model: peft.LoraModel,
-        **_kwargs,
-    ):
-        if self.quantized:
-            # perform final merge and save
-            with torch.no_grad():
-                merge_and_save(
-                    model,
-                    self.last_full_model,
-                    args.output_dir,
-                    reinit=False,
-                    quantized=self.quantized,
-                    actually_save=is_main_process(),
-                    cpu_offload=self.cpu_offload,
-                )
-        # no need to save if unquantized, as finetune.py will call merge_and_unload()
-        return control
-
-
-class ReLoRAScheduler(LRScheduler):
-    """Wraps another scheduler to apply per-lora-restart learning rate warmups."""
-
-    def __init__(
-        self,
-        optimizer: Optimizer,
-        inner_schedule: LRScheduler,
-        relora_steps: int,
-        warmup_steps: int,
-        min_lr_scale: float = 0.001,
-    ) -> None:
-        self.inner_schedule = inner_schedule
-        self.relora_steps = relora_steps
-        self.warmup_steps = warmup_steps
-        self.min_lr_scale = min_lr_scale
-        super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose)
-
-    def get_lr(self) -> float:
-        self.inner_schedule.last_epoch = self.last_epoch
-
-        original = self.inner_schedule.get_lr()
-        step = self.last_epoch
-        if step < self.relora_steps:
-            scale = 1
-        else:
-            cycle_t = min(1.0, (step % self.relora_steps) / self.warmup_steps)
-            scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale
-
-        if isinstance(original, Sequence):
-            return [lr * scale for lr in original]
-        return original * scale
-
-
-def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:
-    model_name = "model.safetensors"
-    if not os.path.exists(str(Path(path) / model_name)) and not os.path.exists(
-        str(Path(path) / f"{model_name}.index.json")
-    ):
-        model_name = "pytorch_model.bin"
-
-    index_path = str(Path(path) / f"{model_name}.index.json")
-    if os.path.exists(index_path):
-        with open(index_path, "r", encoding="utf-8") as file:
-            data = json.load(file)
-        return data["weight_map"]
-    return {(module_name + ".weight"): model_name for module_name in module_names}
-
-
-def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor:
-    if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)):
-        adapter = layer.active_adapter
-        return (
-            peft.utils.transpose(
-                layer.lora_B[adapter].weight.detach().to(device)
-                @ layer.lora_A[adapter].weight.detach().to(device),
-                getattr(layer, "fan_in_fan_out", False),
-            )
-            * layer.scaling[adapter]
-        )
-
-    return layer.get_delta_weight().to(device)
-
-
-def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
-    modules: Dict[str, peft.tuners.lora.LoraLayer] = {}
-
-    key_list = [key for key, _ in model.model.named_modules() if "lora" not in key]
-    for key in key_list:
-        try:
-            # pylint: disable=protected-access
-            _parent, target, _target_name = peft.utils._get_submodules(model.model, key)
-        except AttributeError:
-            continue
-
-        if isinstance(target, peft.tuners.lora.LoraLayer):
-            modules[key] = target
-
-    return modules
-
-
-def update_weights(
-    target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device
-):
-    if reinit:
-        for adapter_name in target.lora_A:
-            target.reset_lora_parameters(adapter_name)
-        for adapter_name in target.lora_embedding_A:
-            target.reset_lora_parameters(adapter_name)
-
-    if isinstance(target, peft.tuners.lora.Linear4bit):
-        # This could be faster, but the quantization of Linear4bit weights occurs
-        # when the module is moved from cpu to gpu. Without meddling *too* deeply in
-        # PEFT's innards or maintaining a duplicate of that codepath, this is good
-        # enough for now.
-        target.weight.quant_state = None
-        target.weight.data = new_weight.cpu()
-        target.to(device)
-    elif isinstance(target, peft.tuners.lora.Linear8bitLt):
-        target.weight = bnb.nn.Int8Params(new_weight, requires_grad=False).to(device)
-    else:
-        target.weight.data = new_weight.to(device)
-
-
-def merge_and_save(
-    model: peft.LoraModel,
-    model_src: str,
-    model_dst: str,
-    reinit: bool = False,
-    quantized: bool = False,
-    cpu_offload: bool = False,
-    actually_save: bool = True,
-):
-    modules = find_lora_modules(model)
-
-    if not quantized:
-        for module_name, target in modules.items():
-            update = target.get_delta_weight(target.active_adapter).detach()
-            target.weight.data += update
-
-            if reinit:
-                for adapter_name in target.lora_A:
-                    target.reset_lora_parameters(adapter_name)
-                for adapter_name in target.lora_embedding_A:
-                    target.reset_lora_parameters(adapter_name)
-        return
-
-    os.makedirs(model_dst, exist_ok=True)
-    shard_paths = sharded_paths(model_src, modules.keys())
-    out_shard_paths = {}
-
-    unique_shards = list(set(shard_paths.values()))
-    for shard_path in unique_shards:
-        out_tensors = {}
-        if shard_path.endswith(".safetensors"):
-            in_tensors = st.load_file(str(Path(model_src) / shard_path))
-        else:
-            in_tensors = torch.load(Path(model_src) / shard_path)
-            if "state_dict" in in_tensors:
-                in_tensors = in_tensors["state_dict"]
-
-        for module_name, target in modules.items():
-            key = module_name + ".weight"
-            if key not in shard_paths or shard_paths[key] != shard_path:
-                continue
-
-            orig_weight = in_tensors[key]
-            old_dev = target.weight.device
-            math_dev = "cpu" if cpu_offload else old_dev
-
-            delta_weight = lora_delta_weight(target, math_dev)
-            new_weight = orig_weight.to(math_dev) + delta_weight
-            del delta_weight
-
-            if actually_save:
-                out_tensors[key] = new_weight.half().cpu()
-
-            update_weights(target, new_weight, reinit=reinit, device=old_dev)
-
-        if actually_save:
-            out_shard_name = shard_path
-            if out_shard_name.startswith("pytorch_model"):
-                out_shard_name = (
-                    out_shard_name.replace("pytorch_model", "model").rstrip(".bin")
-                    + ".safetensors"
-                )
-
-            for module_name in in_tensors:
-                if module_name not in out_tensors:
-                    out_tensors[module_name] = in_tensors[module_name].half()
-                out_shard_paths[module_name] = out_shard_name
-
-            shard_fn = str(Path(model_dst) / out_shard_name)
-            LOG.info(f"saving tensors to {shard_fn}")
-            st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})
-
-        del in_tensors
-        del out_tensors
-        torch.cuda.empty_cache()
-
-    if actually_save and len(unique_shards) > 1:
-        with open(
-            str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8"
-        ) as file:
-            json.dump({"metadata": {}, "weight_map": out_shard_paths}, file)
-
-
-def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str):
-    modules = find_lora_modules(model)
-    shard_paths = sharded_paths(checkpoint_path, modules.keys())
-    unique_shards = list(set(shard_paths.values()))
-
-    for shard_path in unique_shards:
-        tensors = st.load_file(os.path.join(checkpoint_path, shard_path))
-
-        for module_name, target in modules.items():
-            key = module_name + ".weight"
-            if key not in shard_paths or shard_paths[key] != shard_path:
-                continue
-
-            new_weight = tensors[key]
-            update_weights(
-                target, new_weight, reinit=False, device=target.weight.device
-            )
--- a/src/axolotl/prompt_strategies/init.py
+++ b/src/axolotl/prompt_strategies/init.py
@@ -2,10 +2,8 @@

 import importlib

-from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig

-
-def load(strategy, tokenizer, cfg, ds_cfg):
+def load(strategy, tokenizer, cfg):
    try:
        load_fn = "load"
        if strategy.split(".")[-1].startswith("load_"):
@@ -13,9 +11,6 @@ def load(strategy, tokenizer, cfg, ds_cfg):
            strategy = ".".join(strategy.split(".")[:-1])
        mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies")
        func = getattr(mod, load_fn)
-        load_kwargs = {}
-        if strategy == "user_defined":
-            load_kwargs["ds_cfg"] = UserDefinedDatasetConfig(**ds_cfg)
-        return func(tokenizer, cfg, **load_kwargs)
+        return func(tokenizer, cfg)
    except Exception:  # pylint: disable=broad-exception-caught
        return None
--- a/src/axolotl/prompt_strategies/alpaca_instruct.py
+++ b/src/axolotl/prompt_strategies/alpaca_instruct.py
@@ -1,8 +1,49 @@
 """Module loading the AlpacaInstructPromptTokenizingStrategy class"""
+import logging

 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter

+LOG = logging.getLogger("axolotl.prompt_strategies.alpaca_instruct")
+
+
+class LatentSpaceAlpacaPromptTokenizingStrategy(AlpacaPromptTokenizingStrategy):
+    """
+    Overrides the tokenization to include additional padding tokens as
+    latent space on the inputs
+    """
+
+    def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):
+        """Tokenize; append 100 pad tokens if add_eos_token and not strip_bos_token."""
+        # pylint: disable=duplicate-code
+        result = self.tokenizer(
+            prompt,
+            truncation=True,
+            max_length=self.sequence_len,
+            padding=False,
+            return_tensors=None,
+        )
+        # aliases of the lists inside `result`; every edit below is in place
+        ids, mask = result["input_ids"], result["attention_mask"]
+        if not ids:
+            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
+        elif (
+            ids[-1] != self.tokenizer.eos_token_id
+            and len(ids) < self.sequence_len
+            and add_eos_token
+        ):
+            ids.append(self.tokenizer.eos_token_id)
+            mask.append(1)
+        # `ids and` keeps the [0] lookup from raising on an empty tokenizer result
+        if ids and ids[0] == self.tokenizer.bos_token_id and strip_bos_token:
+            del ids[0], mask[0]
+        # latent space; the mask grows too so it stays aligned with input_ids
+        if add_eos_token and not strip_bos_token:
+            ids.extend([self.tokenizer.pad_token_id] * 100)
+            mask.extend([1] * 100)  # NOTE(review): 1 = attended; confirm intent
+        result["labels"] = ids.copy()
+        return result
+

 def load(tokenizer, cfg):
    return AlpacaPromptTokenizingStrategy(
@@ -20,3 +61,12 @@ def load_no_prompt(tokenizer, cfg):
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
+
+
+def load_latent_space(tokenizer, cfg):
+    """Build the latent-space padded Alpaca (instruct style) tokenizing strategy."""
+    # the prompter is created first, then handed over with the cfg settings
+    prompter = AlpacaPrompter(PromptStyle.INSTRUCT.value)
+    return LatentSpaceAlpacaPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
--- a/src/axolotl/prompt_strategies/alpaca_w_system.py
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -57,8 +57,6 @@ class SystemDataPrompter(AlpacaPrompter):
    Alpaca Style Prompter that uses system prompts from the dataset
    """

-    system_format: str = "### System:\n{system}\n\n"
-
    def build_prompt_w_system(
        self,
        system: str,
--- a/src/axolotl/prompt_strategies/metharme.py
+++ b/src/axolotl/prompt_strategies/metharme.py
@@ -1,76 +0,0 @@
-"""Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class"""
-
-import logging
-from typing import Tuple
-
-from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
-from axolotl.prompters import AlpacaPrompter
-
-LOG = logging.getLogger("axolotl")
-
-IGNORE_TOKEN_ID = -100
-
-# pylint: disable=duplicate-code
-
-
-class MetharmePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
-    """
-    Tokenizing strategy for the Metharme models
-    """
-
-    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
-        return (prompt["prompt"], "", prompt["generation"])
-
-    def _tokenize(
-        self,
-        prompt: str,
-        add_eos_token: bool = True,
-        strip_bos_token: bool = False,
-        num_eos_tokens: int = 3,
-    ):
-        result = self.tokenizer(
-            prompt,
-            truncation=True,
-            max_length=self.sequence_len,
-            padding=False,
-            return_tensors=None,
-        )
-        if len(result["input_ids"]) == 0:
-            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
-        # If there's already an EOS token there, subtract from the number added
-        if result["input_ids"][-1] == self.tokenizer.eos_token_id:
-            num_eos_tokens -= 1
-
-        if num_eos_tokens > 0 and add_eos_token and len(result["input_ids"]) > 0:
-            for _ in range(num_eos_tokens):
-                if len(result["input_ids"]) < self.sequence_len:
-                    result["input_ids"].append(self.tokenizer.eos_token_id)
-                    result["attention_mask"].append(1)
-
-        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
-            result["input_ids"] = result["input_ids"][1:]
-            result["attention_mask"] = result["attention_mask"][1:]
-
-        result["labels"] = result["input_ids"].copy()
-        return result
-
-
-class MetharmePrompter(AlpacaPrompter):
-    """
-    Prompter for the Metharme models.
-    """
-
-    system_prompt = ""
-    system_no_input_prompt = ""
-    system_format = ""
-    turn_format = "{instruction}"
-    turn_no_input_format = "{instruction}"
-
-    def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
-        pass
-
-
-def load(tokenizer, cfg):
-    return MetharmePromptTokenizingStrategy(
-        MetharmePrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
-    )
--- a/src/axolotl/prompt_strategies/sharegpt_simple.py
+++ b/src/axolotl/prompt_strategies/sharegpt_simple.py
@@ -31,6 +31,52 @@ def load_guanaco(tokenizer, cfg):
    )


+def load_latent_space(tokenizer, cfg):
+    """Build the latent-space padded ShareGPT (chat style) tokenizing strategy."""
+    # the prompter is created first, then handed over with the cfg settings
+    prompter = ShareGPTPrompter(PromptStyle.CHAT.value)
+    return LatentSpaceShareGPTPromptTokenizingStrategy(
+        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
+    )
+
+
+class LatentSpaceShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    latent space padded sharegpt strategy to grab conversations from the sample row
+    """
+
+    def get_conversation_thread(self, prompt):
+        """Return the value stored under the row's "conversations" key."""
+        return prompt["conversations"]
+
+    def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
+        """Tokenize; append 100 pad tokens if add_eos_token and not strip_bos_token."""
+        # pylint: disable=duplicate-code
+        result = self.tokenizer(
+            prompt,
+            truncation=True,
+            max_length=self.sequence_len,
+            padding=False,
+            return_tensors=None,
+        )
+        # aliases of the lists inside `result`; every edit below is in place
+        ids, mask = result["input_ids"], result["attention_mask"]
+        # an empty result gets no EOS and must never be indexed with [-1] / [0]
+        needs_eos = add_eos_token and len(ids) < self.sequence_len
+        if needs_eos and ids and ids[-1] != self.tokenizer.eos_token_id:
+            ids.append(self.tokenizer.eos_token_id)
+            mask.append(1)
+        # `ids and` keeps the [0] lookup from raising on an empty tokenizer result
+        if ids and ids[0] == self.tokenizer.bos_token_id and strip_bos_token:
+            del ids[0], mask[0]
+        # latent space; the mask grows too so it stays aligned with input_ids
+        if add_eos_token and not strip_bos_token:
+            ids.extend([self.tokenizer.pad_token_id] * 100)
+            mask.extend([1] * 100)  # NOTE(review): 1 = attended; confirm intent
+        result["labels"] = ids.copy()
+        return result
+
+
 class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
    """
    basic sharegpt strategy to grab conversations from the sample row
--- a/src/axolotl/prompt_strategies/user_defined.py
+++ b/src/axolotl/prompt_strategies/user_defined.py
@@ -1,98 +0,0 @@
-"""
-User Defined prompts with configuration from the YML config
-"""
-
-from dataclasses import dataclass
-from functools import partial
-from typing import Optional, Tuple
-
-from axolotl.prompt_strategies.alpaca_w_system import (
-    InstructionWSystemPromptTokenizingStrategy,
-    SystemDataPrompter,
-)
-
-
-@dataclass
-class UserDefinedDatasetConfig:
-    """
-    dataclass configuration representing a userdefined dataset type
-    """
-
-    system_prompt: str = ""
-    field_system: str = "system"
-    field_instruction: str = "instruction"
-    field_input: str = "input"
-    field_output: str = "output"
-    format: str = "{instruction} {input} "
-    no_input_format: str = "{instruction} "
-    system_format: str = "{system}"
-
-    def __getitem__(self, item):
-        return getattr(self, item)
-
-
-class UserDefinedPromptTokenizationStrategy(InstructionWSystemPromptTokenizingStrategy):
-    """
-    Prompt Tokenization Strategy for user defined prompts
-    """
-
-
-def load(tokenizer, cfg, ds_cfg: Optional[UserDefinedDatasetConfig] = None):
-    if not ds_cfg:
-        raise ValueError("Missing dataset prompt configuration")
-
-    system_prompt = ""
-    if ds_cfg.system_prompt:
-        system_prompt = ds_cfg.system_prompt
-
-    def parse_instruction_fields(
-        field_instruction,
-        field_input,
-        field_output,
-        field_system,
-        system_prompt,
-        prompt,
-    ) -> Tuple[str, str, str, str]:
-        return (
-            prompt[field_instruction],
-            prompt[field_input] if field_input in prompt else "",
-            prompt[field_output] if field_output in prompt else "",
-            prompt[field_system] if field_system in prompt else system_prompt,
-        )
-
-    turn_format = ds_cfg.format
-    turn_no_input_format = ds_cfg.no_input_format
-    system_format = ds_cfg.system_format
-
-    class UserDefinedPrompter(SystemDataPrompter):
-        """
-        Prompter for user defined prompts
-        """
-
-        def match_prompt_style(self):
-            self.turn_format = turn_format
-            self.turn_no_input_format = turn_no_input_format
-            self.system_format = system_format
-
-    prompter = UserDefinedPrompter()
-
-    strat = UserDefinedPromptTokenizationStrategy(
-        prompter,
-        tokenizer,
-        cfg.train_on_inputs,
-        cfg.sequence_len,
-    )
-
-    setattr(
-        strat,
-        "parse_instruction_fields",
-        partial(
-            parse_instruction_fields,
-            ds_cfg.field_instruction,
-            ds_cfg.field_input,
-            ds_cfg.field_output,
-            ds_cfg.field_system,
-            system_prompt,
-        ),
-    )
-    return strat
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -13,7 +13,7 @@ from axolotl.prompters import IGNORE_TOKEN_ID
 LOG = logging.getLogger("axolotl")

 IGNORE_INDEX = -100
-LLAMA_DEFAULT_PAD_TOKEN = "<pad>"  # nosec
+LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"  # nosec
 LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
 LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
 LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec
@@ -85,11 +85,7 @@ class PromptTokenizingStrategy(abc.ABC):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

-        if (
-            len(result["input_ids"]) > 0
-            and result["input_ids"][0] == self.tokenizer.bos_token_id
-            and strip_bos_token
-        ):
+        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -26,7 +26,7 @@ class AlpacaPrompter:

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
-    system_format: str = "{system}"
+    system_format: str
    turn_format: str
    turn_no_input_format: str
    prompt_style: Optional[PromptStyle] = None
@@ -63,17 +63,13 @@ class AlpacaPrompter:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
-            res = (
-                self.system_format.format(system=self.system_prompt)
-                if self.system_prompt
-                else ""
-            ) + self.turn_format.format(instruction=instruction, input=input)
+            res = self.system_prompt + self.turn_format.format(
+                instruction=instruction, input=input
+            )
        else:
-            res = (
-                self.system_format.format(system=self.system_no_input_prompt)
-                if self.system_prompt
-                else ""
-            ) + self.turn_no_input_format.format(instruction=instruction)
+            res = self.system_no_input_prompt + self.turn_no_input_format.format(
+                instruction=instruction
+            )
        if output:
            res = f"{res}{output}"
        yield res
--- a/src/axolotl/utils/callbacks.py
+++ b/src/axolotl/utils/callbacks.py
@@ -33,9 +33,7 @@ class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-
        )

        peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
-        kwargs["model"].save_pretrained(
-            peft_model_path, save_safetensors=args.save_safetensors
-        )
+        kwargs["model"].save_pretrained(peft_model_path)

        return control

--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -62,13 +62,6 @@ def normalize_config(cfg):
    else:
        torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False

-    if cfg.bf16 or cfg.bfloat16:
-        cfg.torch_dtype = torch.bfloat16
-    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
-        cfg.torch_dtype = torch.float16
-    else:
-        cfg.torch_dtype = torch.float32
-
    log_gpu_memory_usage(LOG, "baseline", cfg.device)


@@ -97,7 +90,9 @@ def validate_config(cfg):
            "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
        )
    if cfg.load_4bit:
-        raise ValueError("cfg.load_4bit parameter has been deprecated")
+        raise ValueError(
+            "cfg.load_4bit parameter has been deprecated and replaced by cfg.gptq"
+        )

    if cfg.adapter == "qlora":
        if cfg.merge_lora:
@@ -124,19 +119,6 @@ def validate_config(cfg):
    if not cfg.load_in_8bit and cfg.adapter == "lora":
        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")

-    if cfg.relora_steps:
-        if cfg.adapter not in ("lora", "qlora"):
-            raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA")
-
-        if cfg.fsdp:
-            raise ValueError("fsdp not supported with ReLoRA")
-
-        if cfg.deepspeed:
-            raise ValueError("deepspeed not supported with ReLoRA")
-
-        if cfg.lr_scheduler == "one_cycle":
-            raise ValueError("ReLoRA is not compatible with the one_cycle scheduler")
-
    if cfg.trust_remote_code:
        LOG.warning(
            "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -41,7 +41,6 @@ from axolotl.prompters import (
    ShareGPTPrompter,
    SummarizeTLDRPrompter,
 )
-from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process, zero_first
 from axolotl.utils.trainer import (
    calculate_total_num_steps,
@@ -54,10 +53,9 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"

 def prepare_dataset(cfg, tokenizer):
    if not cfg.pretraining_dataset:
-        with zero_first(is_main_process()):
-            train_dataset, eval_dataset = load_prepare_datasets(
-                tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
-            )
+        train_dataset, eval_dataset = load_prepare_datasets(
+            tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
+        )
    else:
        train_dataset = load_pretraining_dataset(
            cfg.pretraining_dataset,
@@ -134,17 +132,8 @@ def load_tokenized_prepared_datasets(
            seed = 42

        datasets = []
-
-        def for_d_in_datasets(dataset_configs):
-            for dataset in dataset_configs:
-                if dataset.name and isinstance(dataset.name, list):
-                    for name in dataset.name:
-                        yield DictDefault({**dataset, "name": name})
-                else:
-                    yield dataset
-
        # pylint: disable=invalid-name
-        for d in for_d_in_datasets(cfg.datasets):
+        for d in cfg.datasets:
            ds: Union[Dataset, DatasetDict] = None
            ds_from_hub = False
            try:
@@ -171,15 +160,8 @@ def load_tokenized_prepared_datasets(
                        split=None,
                    )
                elif local_path.is_file():
-                    ds_type = "json"
-                    if d.ds_type:
-                        ds_type = d.ds_type
-                    elif ".parquet" in d.path:
-                        ds_type = "parquet"
-                    elif ".arrow" in d.path:
-                        ds_type = "arrow"
                    ds = load_dataset(
-                        ds_type,
+                        "json",
                        name=d.name,
                        data_files=d.path,
                        streaming=False,
@@ -216,27 +198,13 @@ def load_tokenized_prepared_datasets(
                    )
                else:
                    ds = ds.shuffle(seed=seed).shard(num_shards=d.shards, index=0)
-
-            d_base_type = d_prompt_style = None
            d_type = d.type
-            if isinstance(d_type, str):
-                d_type_split = d_type.split(":")
-                d_base_type = d_type_split[0]
-                d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
+            d_type_split = d_type.split(":")
+            d_base_type = d_type_split[0]
+            d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
            if "train" in ds:
                ds = ds["train"]
-            if (
-                "input_ids" in ds.features
-                and "attention_mask" in ds.features
-                and "labels" in ds.features
-            ):
-                # dataset is already tokenized, just drop it straight in
-                datasets.append(ds)
-            elif isinstance(d.type, DictDefault):
-                ds_strategy = load("user_defined", tokenizer, cfg, d.type.to_dict())
-                ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
-                datasets.append(ds_wrapper)
-            elif ds_strategy := load(d.type, tokenizer, cfg, d):
+            if ds_strategy := load(d.type, tokenizer, cfg):
                ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                datasets.append(ds_wrapper)
            elif d_base_type == "alpaca":
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -243,18 +243,6 @@ class MultipackDistributedDataloader:
            len_remaining -= 1
            if not len_remaining:
                return
-        # yield a no-op for cases where we don't have any data left to pack
-        for i in range(0, len_remaining):
-            yield self.collate_fn(
-                [
-                    {
-                        "input_ids": [0],
-                        "labels": [-100],
-                        "attention_mask": [True],
-                        "position_ids": [0],
-                    }
-                ]
-            )

    def _len_est(self):
        lengths_sum = np.sum(self.lengths)
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -4,37 +4,32 @@
 import logging
 import math
 import os
-from typing import Optional, Tuple  # noqa: F401
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401

 import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
-from peft import PeftConfig, prepare_model_for_kbit_training
 from transformers import (  # noqa: F401
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
-    GPTQConfig,
    LlamaConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
 )

-from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
+from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger("axolotl")

+if TYPE_CHECKING:
+    from peft import PeftConfig  # noqa: F401

-def load_model_config(cfg):
-    model_config_name = cfg.base_model_config or cfg.base_model
-    trust_remote_code: bool = False or cfg.trust_remote_code
-    return AutoConfig.from_pretrained(
-        model_config_name, trust_remote_code=trust_remote_code
-    )
+    from axolotl.utils.dict import DictDefault  # noqa: F401


 def load_tokenizer(cfg):
@@ -59,18 +54,11 @@ def load_tokenizer(cfg):
        **tokenizer_kwargs,
    )

-    if (
-        tokenizer.__class__.__name__
-        in [
-            "LlamaTokenizer",
-            "LlamaTokenizerFast",
-            "CodeLlamaTokenizer",
-        ]
-        and hasattr(tokenizer, "pad_token")
-        and not tokenizer.pad_token
-    ):
-        # set a pad_token, but use eos_token so we don't add a new token
-        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
+    if tokenizer.__class__.__name__ in [
+        "LlamaTokenizer",
+        "LlamaTokenizerFast",
+    ]:
+        tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN

    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
@@ -91,10 +79,8 @@ def load_tokenizer(cfg):


 def load_model(
-    cfg: DictDefault,
-    tokenizer: PreTrainedTokenizerBase,
-    inference: bool = False,
-) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
+    cfg, tokenizer
+):  # type: (DictDefault, PreTrainedTokenizerBase) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
    Load a model for a given configuration and tokenizer.
    """
@@ -104,15 +90,20 @@ def load_model(

    # TODO refactor as a kwarg
    load_in_8bit = cfg.load_in_8bit
+    cfg.is_llama_derived_model = (
+        "llama" in base_model
+        or (cfg.model_type and "llama" in cfg.model_type.lower())
+        or cfg.is_llama_derived_model
+    )

    if cfg.is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and not inference:
+        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                replace_llama_attn_with_flash_attn,
            )

            LOG.info("patching with flash attention")
-            replace_llama_attn_with_flash_attn(packed=cfg.sample_packing)
+            replace_llama_attn_with_flash_attn()
    elif cfg.is_llama_derived_model and cfg.xformers_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
@@ -121,7 +112,9 @@ def load_model(
        LOG.info("patching with xformers attention")
        hijack_llama_attention()
    elif cfg.is_llama_derived_model and cfg.sdp_attention:
-        from axolotl.monkeypatch.llama_attn_hijack_sdp import hijack_llama_sdp_attention
+        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
+            hijack_llama_sdp_attention,
+        )

        LOG.info("patching with sdp attention")
        hijack_llama_sdp_attention()
@@ -148,33 +141,94 @@ def load_model(
    if (
        cfg.is_llama_derived_model
        and (cfg.max_packed_sequence_len or cfg.sample_packing)
-        and not inference
+        and not cfg.inference
    ):
        from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask

        LOG.info("patching _expand_mask")
        hijack_expand_mask()

+    if cfg.bf16 or cfg.bfloat16:
+        torch_dtype = torch.bfloat16
+    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
+        torch_dtype = torch.float16
+    else:
+        torch_dtype = torch.float32
+    try:
+        if cfg.gptq:
+            from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import (
+                replace_peft_model_with_int4_lora_model,
+            )
+
+            replace_peft_model_with_int4_lora_model()
+    except Exception as err:
+        LOG.exception(err)
+        raise err
+
+    if not cfg.gptq and (
+        (cfg.adapter == "lora" and load_in_8bit)
+        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
+    ):
+        try:
+            from peft import prepare_model_for_kbit_training
+        except ImportError:
+            # For backward compatibility
+            from peft import (
+                prepare_model_for_int8_training as prepare_model_for_kbit_training,
+            )
+
    model_kwargs = {}
    if cfg.model_revision:
        model_kwargs["revision"] = cfg.model_revision
-    if cfg.gptq:
-        # TODO we should figure out how read the models config.json first
-        model_kwargs["quantization_config"] = GPTQConfig(
-            bits=cfg.gptq_bits,
-            disable_exllama=True,
-        )
    if cfg.adapter == "qlora" and cfg.load_in_4bit:
        model_kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
-            bnb_4bit_compute_dtype=cfg.torch_dtype,
+            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
    try:
-        if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
+        if cfg.gptq and cfg.is_llama_derived_model:
+            from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
+            from huggingface_hub import snapshot_download
+
+            try:
+                snapshot_download_kwargs = {}
+                if cfg.base_model_ignore_patterns:
+                    snapshot_download_kwargs[
+                        "ignore_patterns"
+                    ] = cfg.base_model_ignore_patterns
+                cache_model_path = Path(
+                    snapshot_download(base_model, **snapshot_download_kwargs)
+                )
+                files = (
+                    list(cache_model_path.glob("*.pt"))
+                    + list(cache_model_path.glob("*.safetensors"))
+                    + list(cache_model_path.glob("*.bin"))
+                )
+                if len(files) > 0:
+                    model_path = str(files[0])
+                else:
+                    LOG.warning(
+                        "unable to find a cached model file, this will likely fail..."
+                    )
+                    model_path = str(cache_model_path)
+            except Exception:  # pylint: disable=broad-exception-caught
+                model_path = cfg.base_model
+            model, _ = load_llama_model_4bit_low_ram(
+                base_model_config if base_model_config else base_model,
+                model_path,
+                device_map=cfg.device_map,
+                half=cfg.fp16,
+                groupsize=cfg.gptq_groupsize if cfg.gptq_groupsize else -1,
+                is_v1_model=cfg.gptq_model_v1
+                if cfg.gptq_model_v1 is not None
+                else True,
+            )
+            load_in_8bit = False
+        elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
            from transformers import LlamaForCausalLM

            config_kwargs = {}
@@ -190,7 +244,7 @@ def load_model(
                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
-                torch_dtype=cfg.torch_dtype,
+                torch_dtype=torch_dtype,
                **model_kwargs,
            )
        # elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
@@ -220,24 +274,15 @@ def load_model(
        #     )
        #     model.train() # sets to train instead of eval mode
        elif model_type and not cfg.trust_remote_code:
-            if cfg.gptq:
-                model = AutoModelForCausalLM.from_pretrained(
-                    base_model,
-                    device_map=cfg.device_map,
-                    torch_dtype=cfg.torch_dtype,
-                    trust_remote_code=cfg.trust_remote_code or False,
-                    **model_kwargs,
-                )
-            else:
-                model = getattr(transformers, model_type).from_pretrained(
-                    base_model,
-                    device_map=cfg.device_map,
-                    load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
-                    load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
-                    torch_dtype=cfg.torch_dtype,
-                    trust_remote_code=cfg.trust_remote_code or False,
-                    **model_kwargs,
-                )
+            model = getattr(transformers, model_type).from_pretrained(
+                base_model,
+                device_map=cfg.device_map,
+                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
+                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
+                torch_dtype=torch_dtype,
+                trust_remote_code=cfg.trust_remote_code or False,
+                **model_kwargs,
+            )
        else:
            config = AutoConfig.from_pretrained(
                base_model,
@@ -265,7 +310,7 @@ def load_model(
                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
-                torch_dtype=cfg.torch_dtype,
+                torch_dtype=torch_dtype,
                trust_remote_code=cfg.trust_remote_code or False,
                **model_kwargs,
            )
@@ -279,7 +324,7 @@ def load_model(
            device_map=cfg.device_map,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
-            torch_dtype=cfg.torch_dtype,
+            torch_dtype=torch_dtype,
            trust_remote_code=cfg.trust_remote_code or False,
            **model_kwargs,
        )
@@ -304,46 +349,46 @@ def load_model(
    if model.device.type == "cuda":
        log_gpu_memory_usage(LOG, "after model load", model.device)

-    # make sure these are fp32 per Ramesh et al. (2021)
-    for name, module in model.named_modules():
-        if "norm" in name:
-            module.to(torch.float32)
-        if "lm_head" in name or "embed_tokens" in name:
-            if hasattr(module, "weight"):
-                module.to(torch.float32)
-
-    needs_fa2_dtype = cfg.adapter or cfg.fsdp
-    if (cfg.adapter == "lora" and load_in_8bit) or (
-        cfg.adapter == "qlora" and cfg.load_in_4bit
+    if not cfg.gptq and (
+        (cfg.adapter == "lora" and load_in_8bit)
+        or (cfg.adapter == "qlora" and cfg.load_in_4bit)
    ):
        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
-        if cfg.gradient_checkpointing:
-            model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(
            model, use_gradient_checkpointing=cfg.gradient_checkpointing
        )
-        needs_fa2_dtype = True

-    # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
-    # convert them back to fp16/bf16 for flash-attn compatibility.
-    if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model):
-        LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
-        for name, module in model.named_modules():
-            if "norm" in name:
-                module.to(cfg.torch_dtype)
-            if "lm_head" in name or "embed_tokens" in name:
-                if hasattr(module, "weight"):
-                    module.to(cfg.torch_dtype)
+        # LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
+        # convert them back to fp16/bf16 for flash-attn compatibility.
+        if cfg.flash_attention and cfg.is_llama_derived_model:
+            for name, module in model.named_modules():
+                if "norm" in name:
+                    module.to(torch_dtype)
+                if "lm_head" in name or "embed_tokens" in name:
+                    if hasattr(module, "weight"):
+                        module.to(torch_dtype)

    model, lora_config = load_adapter(model, cfg, cfg.adapter)

    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")

+    if cfg.gptq:
+        # Scales to half
+        LOG.info("Fitting 4bit scales and zeros to half")
+        for _, module in model.named_modules():
+            if "Autograd4bitQuantLinear" in str(type(module)) or "Linear4bitLt" in str(
+                type(module)
+            ):
+                if hasattr(module, "is_v1_model") and module.is_v1_model:
+                    module.zeros = module.zeros.half()
+                module.scales = module.scales.half()
+                module.bias = module.bias.half()
+
    if (
        torch.cuda.device_count() > 1
        and int(os.getenv("WORLD_SIZE", "1")) > 1
-        and (cfg.load_in_4bit)
+        and (cfg.gptq or cfg.load_in_4bit)
    ):
        # llama is PROBABLY model parallelizable, but the default isn't that it is
        # so let's only set it for the 4bit, see
@@ -369,15 +414,15 @@ def load_model(
    return model, lora_config


-def load_adapter(model, cfg, adapter, inference=False):
-    # type: (PreTrainedModel, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+def load_adapter(model, cfg, adapter):
+    # type: (PreTrainedModel, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]

    if adapter is None:
        return model, None
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    if adapter in ["lora", "qlora"]:
-        return load_lora(model, cfg, inference=inference)
+        return load_lora(model, cfg)
    if adapter == "llama-adapter":
        return load_llama_adapter(model, cfg)

@@ -395,7 +440,7 @@ def load_llama_adapter(model, cfg):
    )

    if cfg.lora_model_dir:
-        LOG.debug("Loading pretained PEFT - llama_adapter")
+        LOG.info("Loading pretained LORA")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
@@ -409,8 +454,12 @@ def load_llama_adapter(model, cfg):
    return model, peft_config


-def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
+def find_all_linear_names(bits, model):
+    cls = (
+        bnb.nn.Linear4bit
+        if bits == 4
+        else (bnb.nn.Linear8bitLt if bits == 8 else torch.nn.Linear)
+    )
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
@@ -423,15 +472,21 @@ def find_all_linear_names(model):
    return list(lora_module_names)


-def load_lora(model, cfg, inference=False):
-    # type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+def load_lora(model, cfg):
+    # type: (PreTrainedModel, DictDefault) -> Tuple[PreTrainedModel, Optional[PeftConfig]]

    from peft import LoraConfig, PeftModel, get_peft_model

    lora_target_modules = list(cfg.lora_target_modules or [])

    if cfg.lora_target_linear:
-        linear_names = find_all_linear_names(model)
+        bits = None
+        if cfg.load_in_4bit:
+            bits = 4
+        elif cfg.load_in_8bit:
+            bits = 8
+
+        linear_names = find_all_linear_names(bits, model)
        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

@@ -447,11 +502,10 @@ def load_lora(model, cfg, inference=False):
    )

    if cfg.lora_model_dir:
-        LOG.debug("Loading pretained PEFT - LoRA")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
-            is_trainable=(not inference),
+            is_trainable=not cfg.inference,
        )
    else:
        model = get_peft_model(model, lora_config)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -10,15 +10,17 @@ from functools import partial
 from pathlib import Path
 from typing import Optional, Union

+import bitsandbytes as bnb
 import numpy as np
 import torch.cuda
+import transformers
 from datasets import Dataset, set_caching_enabled
+from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
 from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
-from transformers.trainer_pt_utils import SequentialDistributedSampler
+from transformers.trainer_pt_utils import get_parameter_names

-from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
 from axolotl.utils.callbacks import (
    GPUStatsCallback,
    SaveBetterTransformerModelCallback,
@@ -26,7 +28,10 @@ from axolotl.utils.callbacks import (
 )
 from axolotl.utils.collators import DataCollatorForSeq2Seq
 from axolotl.utils.dataloader import MultipackDistributedDataloader
-from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
+from axolotl.utils.schedulers import (
+    InterpolatingLogScheduler,
+    get_cosine_schedule_with_quadratic_warmup,
+)

 LOG = logging.getLogger("axolotl")

@@ -119,14 +124,6 @@ class AxolotlTrainingArguments(TrainingArguments):
        default=1,
        metadata={"help": "the multiplier for the max len for packed sequences"},
    )
-    relora_steps: Optional[int] = field(
-        default=None,
-        metadata={"help": "how often to reset for ReLoRA"},
-    )
-    relora_warmup_steps: Optional[int] = field(
-        default=None,
-        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
-    )


 class AxolotlTrainer(Trainer):
@@ -174,18 +171,6 @@ class AxolotlTrainer(Trainer):
            )
        return super()._get_train_sampler()

-    def _get_eval_sampler(
-        self, eval_dataset: Dataset
-    ) -> Optional[torch.utils.data.Sampler]:
-        if self.args.world_size > 1 and self.args.sample_packing:
-            return SequentialDistributedSampler(
-                eval_dataset,
-                num_replicas=self.args.world_size,
-                rank=self.args.process_index,
-                batch_size=self.args.per_device_eval_batch_size,
-            )
-        return super()._get_eval_sampler(eval_dataset)
-
    def get_train_dataloader(self) -> Union[DataLoader, MultipackDistributedDataloader]:
        if self.args.sample_packing:
            train_sampler = self._get_train_sampler()
@@ -210,7 +195,6 @@ class AxolotlTrainer(Trainer):
            eval_dataset = (
                eval_dataset if eval_dataset is not None else self.eval_dataset
            )
-
            eval_sampler = self._get_eval_sampler(eval_dataset)
            return self.accelerator.prepare(
                MultipackDistributedDataloader(
@@ -265,39 +249,6 @@ class OneCycleLRSchedulerTrainer(AxolotlTrainer):
        return self.lr_scheduler


-class ReLoRATrainer(AxolotlTrainer):
-    """
-    Trainer subclass that uses the OneCycleLR scheduler
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.lr_scheduler = None
-
-    def create_scheduler(
-        self,
-        num_training_steps: int,
-        optimizer: Optional[torch.optim.Optimizer] = None,
-    ):
-        optimizer = self.optimizer if optimizer is None else optimizer
-        lr_scheduler = super().create_scheduler(num_training_steps, optimizer)
-
-        if self.args.relora_steps:
-            warmup_steps = (
-                self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10
-            )
-            self.lr_scheduler = ReLoRAScheduler(
-                optimizer,
-                lr_scheduler,
-                self.args.relora_steps,
-                warmup_steps,
-            )
-        else:
-            self.lr_scheduler = lr_scheduler
-
-        return self.lr_scheduler
-
-
 def add_position_ids(sample):
    sample["position_ids"] = torch.arange(len(sample["input_ids"]))
    return sample
@@ -317,15 +268,15 @@ def disable_datasets_caching():


 def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
-    drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
-    train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count())
-    if eval_dataset:
-        eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count())
-
    if cfg.sample_packing:
-        train_dataset = train_dataset.map(add_position_ids, num_proc=os.cpu_count())
+        drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
+        train_dataset = train_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
+            add_position_ids, num_proc=os.cpu_count()
+        )
        if eval_dataset:
-            eval_dataset = eval_dataset.map(add_position_ids, num_proc=os.cpu_count())
+            eval_dataset = eval_dataset.filter(drop_long, num_proc=os.cpu_count()).map(
+                add_position_ids, num_proc=os.cpu_count()
+            )
    return train_dataset, eval_dataset


@@ -404,16 +355,10 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):

 def setup_fsdp_envs(cfg):
    os.environ["ACCELERATE_USE_FSDP"] = "true"
-    if cfg.fsdp_config.fsdp_offload_params:
-        os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
    if cfg.fsdp_config.fsdp_sync_module_states:
        os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
    if cfg.fsdp_config.fsdp_state_dict_type:
        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.fsdp_state_dict_type
-    if cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap:
-        os.environ[
-            "FSDP_TRANSFORMER_CLS_TO_WRAP"
-        ] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap


 def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
@@ -447,7 +392,23 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        training_arguments_kwargs["seed"] = cfg.seed

    if cfg.gradient_checkpointing:
-        training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
+        if cfg.gptq:
+            from alpaca_lora_4bit.gradient_checkpointing import (
+                apply_gradient_checkpointing,
+            )
+
+            gradient_checkpointing_ratio = (
+                cfg.gradient_checkpointing_ratio
+                if cfg.gradient_checkpointing_ratio
+                else 1.0
+            )
+            apply_gradient_checkpointing(
+                model, checkpoint_ratio=gradient_checkpointing_ratio
+            )
+        else:
+            training_arguments_kwargs[
+                "gradient_checkpointing"
+            ] = cfg.gradient_checkpointing
    if cfg.fsdp:
        training_arguments_kwargs["fsdp"] = cfg.fsdp
        if cfg.fsdp_config:
@@ -494,13 +455,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        # we have an eval set, but no steps defined, use epoch
        training_arguments_kwargs["evaluation_strategy"] = "epoch"

-    if cfg.save_strategy:
-        training_arguments_kwargs["save_strategy"] = cfg.save_strategy
-    else:
-        training_arguments_kwargs["save_strategy"] = (
-            "steps" if cfg.save_steps else "epoch"
-        )
-
    training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
        max_steps=total_num_steps if cfg.max_steps else -1,
        max_seq_length=cfg.sequence_len,
@@ -512,6 +466,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        eval_accumulation_steps=cfg.gradient_accumulation_steps,
        num_train_epochs=cfg.num_epochs,
        learning_rate=cfg.learning_rate,
+        save_strategy="steps" if cfg.save_steps else "epoch",
        save_steps=cfg.save_steps,
        output_dir=cfg.output_dir,
        save_total_limit=cfg.save_total_limit if cfg.save_total_limit else 4,
@@ -534,8 +489,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
        sample_packing=cfg.sample_packing if cfg.sample_packing else False,
        sample_packing_seq_len_multiplier=cfg.micro_batch_size,
-        relora_steps=cfg.relora_steps,
-        relora_warmup_steps=cfg.relora_warmup_steps,
        **training_arguments_kwargs,
    )

@@ -545,13 +498,69 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        if Path(cfg.torchdistx_path).exists():
            sys.path.append(cfg.torchdistx_path)
            importlib.import_module("torchdistx")
+    if (
+        cfg.optimizer == "adamw_bnb_8bit"
+        and not cfg.gptq
+        and "deepspeed" not in training_arguments_kwargs
+        and not cfg.fsdp
+    ):
+        decay_parameters = get_parameter_names(model, [nn.LayerNorm])
+        decay_parameters = [name for name in decay_parameters if "bias" not in name]
+        optimizer_grouped_parameters = [
+            {
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if (n in decay_parameters and p.requires_grad)
+                ],
+                "weight_decay": training_args.weight_decay,
+            },
+            {
+                "params": [
+                    p
+                    for n, p in model.named_parameters()
+                    if (n not in decay_parameters and p.requires_grad)
+                ],
+                "weight_decay": 0.0,
+            },
+        ]
+
+        optimizer = bnb.optim.Adam8bit(
+            optimizer_grouped_parameters,
+            betas=(training_args.adam_beta1, training_args.adam_beta2),
+            eps=training_args.adam_epsilon,
+            lr=training_args.learning_rate,
+        )
+
+        if cfg.lr_scheduler == "one_cycle":
+            lr_scheduler_kwargs = (
+                cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {}
+            )
+            lr_scheduler = OneCycleLR(
+                optimizer,
+                cfg.learning_rate,
+                total_steps=total_num_steps,
+                epochs=cfg.num_epochs,
+                div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
+                **lr_scheduler_kwargs,
+            )
+        elif cfg.lr_scheduler == "log_sweep":
+            lr_scheduler = InterpolatingLogScheduler(
+                optimizer,
+                cfg.warmup_steps,
+                cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10,
+                cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10,
+            )
+        else:
+            lr_scheduler = transformers.get_cosine_schedule_with_warmup(
+                optimizer,
+                training_args.warmup_steps,
+                total_num_steps,
+            )
+        trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)

    callbacks = []
    callbacks.append(GPUStatsCallback(cfg))
-
-    if cfg.relora_steps:
-        callbacks.append(ReLoRACallback(cfg))
-
    # TODO on_save callback to sync checkpoints to GCP/AWS in background
    if cfg.early_stopping_patience:
        early_stop_cb = EarlyStoppingCallback(
@@ -569,12 +578,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        callbacks.append(SaveBetterTransformerModelCallback)

    data_collator_kwargs = {
-        "padding": True,  # True/"longest" is the default
+        "padding": True,
    }
-    if cfg.pad_to_sequence_len:
-        data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil(
-            cfg.sequence_len / 64
-        )
+    if cfg.collator_pad_to_longest:
+        data_collator_kwargs["padding"] = "longest"
    else:
        # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
        # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
@@ -598,11 +605,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
                num_proc=32,
            )

-    trainer_cls = AxolotlTrainer
-    if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora"):
-        trainer_cls = OneCycleLRSchedulerTrainer
-    elif cfg.relora_steps:
-        trainer_cls = ReLoRATrainer
+    trainer_cls = (
+        OneCycleLRSchedulerTrainer
+        if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
+        else AxolotlTrainer
+    )
    trainer = trainer_cls(
        model=model,
        train_dataset=train_dataset,
--- a/tests/fixtures/conversation.tokenized_llama2chat.json
+++ b/tests/fixtures/conversation.tokenized_llama2chat.json