From 1edc30c786794ba2d57976c417378a0d27ced6eb Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 27 May 2023 17:57:29 -0400
Subject: [PATCH 01/55] add support for opimum bettertransformers

---
 configs/gpt_neox_20b.yml        | 30 ++++++++++++++++++------------
 requirements.txt                |  1 +
 scripts/finetune.py             | 15 +++++++++++----
 src/axolotl/utils/models.py     |  8 ++++++--
 src/axolotl/utils/validation.py |  8 ++++++++
 5 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/configs/gpt_neox_20b.yml b/configs/gpt_neox_20b.yml
index 730afb72c..25fdae53b 100644
--- a/configs/gpt_neox_20b.yml
+++ b/configs/gpt_neox_20b.yml
@@ -1,24 +1,25 @@
 base_model: EleutherAI/gpt-neox-20b
+base_model_config: EleutherAI/gpt-neox-20b
 base_model_ignore_patterns: pytorch* # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
-load_in_8bit: true
+load_in_8bit: false
+load_in_4bit: true
+load_4bit: false
 datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
+  - path: vicgalle/alpaca-gpt4
     type: alpaca
-    shards: 4
-    shards_index: 0
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
-adapter: lora
+adapter:
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
-lora_r: 8
+lora_r: 64
 lora_alpha: 32
-lora_dropout: 0.05
+lora_dropout: 0.0
 lora_target_modules:
-  - query_key_value
+lora_target_linear: true
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
 wandb_project: gpt4all-neox-20b
 wandb_watch:
@@ -26,14 +27,19 @@ wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
 gradient_accumulation_steps: 1
-micro_batch_size: 4
+micro_batch_size: 2
 num_epochs: 5
 learning_rate: 0.00003
-lr_scheduler: one_cycle
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
 train_on_inputs: false
 group_by_length: false
-bf16: True
-tf32: True
+bf16: false
+fp16: false
+float16: true
+tf32: true
+flash_optimum: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
+gradient_checkpointing: true
diff --git a/requirements.txt b/requirements.txt
index c9123fce8..d1b2f4555 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ sentencepiece
 wandb
 einops
 xformers
+optimum
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
diff --git a/scripts/finetune.py b/scripts/finetune.py
index fa2dcf903..a5b5e7c85 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -6,6 +6,7 @@
 import os
 import random
 import signal
 import sys
+from functools import partial
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -19,6 +20,8 @@
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
 # add src to the pythonpath so we don't need to pip install this
+from optimum.bettertransformer import BetterTransformer
+
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.validation import validate_config
@@ -264,12 +267,14 @@ def train(

     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
     if cfg.local_rank == 0:
+        def terminate_handler(signum, frame, model):
+            if cfg.flash_optimum:
+                model = BetterTransformer.reverse(model)
+            model.save_pretrained(cfg.output_dir)
+            sys.exit(0)
         signal.signal(
             signal.SIGINT,
-            lambda signal, frame: (
-                model.save_pretrained(cfg.output_dir),
-                sys.exit(0),
-            ),
+            lambda signum, frame: terminate_handler(signum, frame, model)
         )

     logging.info("Starting trainer...")
@@ -299,6 +304,8 @@ def train(
     # TODO do we need this fix?
https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file if cfg.local_rank == 0: + if cfg.flash_optimum: + model = BetterTransformer.reverse(model) model.save_pretrained(cfg.output_dir) # trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 1acaf6ab3..11b4629ec 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,7 +11,8 @@ import bitsandbytes as bnb import torch import transformers from transformers import PreTrainedModel # noqa: F401 -from transformers import ( # noqa: F401 +from optimum.bettertransformer import BetterTransformer +from transformers import ( AutoConfig, AutoModelForCausalLM, AutoTokenizer, @@ -137,7 +138,7 @@ def load_model( if cfg.bf16: torch_dtype = torch.bfloat16 - elif cfg.load_in_8bit or cfg.fp16: + elif cfg.load_in_8bit or cfg.fp16 or cfg.float16: torch_dtype = torch.float16 else: torch_dtype = torch.float32 @@ -342,6 +343,9 @@ def load_model( logging.warning("there are no parameters that require gradient updates") model.config.use_cache = False + if cfg.flash_optimum: + model = BetterTransformer.transform(model) + # TODO resume_from_checkpoint handling return model, lora_config diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 04ffc4c1b..ba5feafe8 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -57,6 +57,14 @@ def validate_config(cfg): if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp: raise ValueError("FSDP is not supported for falcon models") + if cfg.flash_optimum is True: + if cfg.adapter: + logging.warning("BetterTransformers probably doesn't work with PEFT adapters") + if cfg.fp16 or cfg.bf16: + raise ValueError("AMP is not supported with BetterTransformer") + if cfg.float16 is not True: + logging.warning("You should probably set float16 to true") + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 From 879219979955fa2c3a2394578a8886f77e687594 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 27 May 2023 18:12:12 -0400 Subject: [PATCH 02/55] add flash attn context for efficient training and attempt setting model to train mode: --- scripts/finetune.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index a5b5e7c85..99236b087 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -252,6 +252,24 @@ def train( model.save_pretrained(cfg.output_dir) return + if cfg.debug: + logging.info("check_dataset_labels...") + check_dataset_labels( + train_dataset.select( + [random.randrange(0, len(train_dataset) - 1) for i in range(5)] + ), + tokenizer, + ) + + if prepare_ds_only: + logging.info("Finished preparing dataset. 
Exiting...") + return + + try: + model.train() + except: + pass + trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) model.config.use_cache = False @@ -297,7 +315,11 @@ def train( if not Path(cfg.output_dir).is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) - trainer.train(resume_from_checkpoint=resume_from_checkpoint) + if cfg.flash_optimum: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + else: + trainer.train(resume_from_checkpoint=resume_from_checkpoint) logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}") From 39619028a37f4af77dd0b89c9b8191c783d7049a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 27 May 2023 19:37:24 -0400 Subject: [PATCH 03/55] use pythia-12b, neox-20b is flaky --- examples/pythia-12b/README.md | 10 ++++++++++ .../pythia-12b/config.yml | 20 +++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 examples/pythia-12b/README.md rename configs/gpt_neox_20b.yml => examples/pythia-12b/config.yml (72%) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md new file mode 100644 index 000000000..0953caa4e --- /dev/null +++ b/examples/pythia-12b/README.md @@ -0,0 +1,10 @@ +# Python 12B + +- Single-GPU A100 only (?) + +```shell +python scripts/finetune.py examples/pythia-12b/config.yml +``` + +⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️ + diff --git a/configs/gpt_neox_20b.yml b/examples/pythia-12b/config.yml similarity index 72% rename from configs/gpt_neox_20b.yml rename to examples/pythia-12b/config.yml index 25fdae53b..28e822c77 100644 --- a/configs/gpt_neox_20b.yml +++ b/examples/pythia-12b/config.yml @@ -1,11 +1,12 @@ -base_model: EleutherAI/gpt-neox-20b -base_model_config: EleutherAI/gpt-neox-20b +base_model: EleutherAI/pythia-12b-deduped +base_model_config: EleutherAI/pythia-12b-deduped base_model_ignore_patterns: pytorch* # prefer safetensors model_type: GPTNeoXForCausalLM tokenizer_type: AutoTokenizer load_in_8bit: false -load_in_4bit: true -load_4bit: false +load_in_4bit: false +gptq: false +device_map: auto datasets: - path: vicgalle/alpaca-gpt4 type: alpaca @@ -21,16 +22,16 @@ lora_dropout: 0.0 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific -wandb_project: gpt4all-neox-20b +wandb_project: pythia-12b wandb_watch: wandb_run_id: wandb_log_model: -output_dir: ./gpt4all-neox-20b +output_dir: ./pythia-12b gradient_accumulation_steps: 1 -micro_batch_size: 2 +micro_batch_size: 1 num_epochs: 5 learning_rate: 0.00003 -optimizer: paged_adamw_32bit +optimizer: adamw_bnb_8bit lr_scheduler: cosine train_on_inputs: false group_by_length: false @@ -43,3 +44,6 @@ early_stopping_patience: resume_from_checkpoint: local_rank: gradient_checkpointing: true +fsdp: +fsdp_transformer_layer_cls_to_wrap: +collator_pad_to_longest: true From 71a43f8479a1cef0247ceb2cc00c7c1a048ed863 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 28 May 2023 08:56:08 -0400 Subject: [PATCH 04/55] add validation/warning for bettertransformers and torch version --- src/axolotl/utils/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index ba5feafe8..db19900cc 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -1,7 +1,7 @@ """Module for validating 
config files""" import logging - +import torch def validate_config(cfg): if cfg.gradient_accumulation_steps and cfg.batch_size: @@ -63,7 +63,10 @@ def validate_config(cfg): if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") if cfg.float16 is not True: - logging.warning("You should probably set float16 to true") + logging.warning("You should probably set float16 to true to load the model in float16 for BetterTransformers") + if torch.__version__.split(".")[0] < 2: + logging.warning("torch>=2.0.0 required") + raise ValueError(f"flash_optimum for BetterTransformers may not be used with {torch.__version__}") # TODO # MPT 7b From 488a67d75a4a6ccf7ed0862bbe913a356a473b0d Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 31 May 2023 16:51:19 -0400 Subject: [PATCH 05/55] experimental expansion of ctx len --- scripts/finetune.py | 44 +++++++++++++++++++++++---------------- src/axolotl/utils/data.py | 32 +++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 99236b087..88815dfdd 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -6,22 +6,20 @@ import os import random import signal import sys -from functools import partial from pathlib import Path from typing import Any, Dict, List, Optional, Union import fire import torch import yaml -from transformers import GenerationConfig, TextStreamer - -from axolotl.utils.data import load_prepare_datasets -from axolotl.utils.dict import DictDefault -from axolotl.utils.models import load_model, load_tokenizer # add src to the pythonpath so we don't need to pip install this from optimum.bettertransformer import BetterTransformer +from transformers import GenerationConfig, TextStreamer +from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_model, load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.trainer import setup_trainer from axolotl.utils.validation import validate_config @@ -204,9 +202,19 @@ def train( if check_not_in( ["inference", "shard", "merge_lora"], kwargs ): # don't need to load dataset for these - train_dataset, eval_dataset = load_prepare_datasets( - tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH - ) + if not cfg.pretraining_dataset: + train_dataset, eval_dataset = load_prepare_datasets( + tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH + ) + else: + if cfg.pretraining_dataset is True: + pretraining_dataset = "togethercomputer/RedPajama-Data-1T" + else: + pretraining_dataset = cfg.pretraining_dataset + train_dataset = load_pretraining_dataset( + pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + ) + eval_dataset = None if cfg.debug or "debug" in kwargs: logging.info("check_dataset_labels...") @@ -256,7 +264,7 @@ def train( logging.info("check_dataset_labels...") check_dataset_labels( train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for i in range(5)] + [random.randrange(0, len(train_dataset) - 1) for i in range(5)] # nosec ), tokenizer, ) @@ -265,10 +273,7 @@ def train( logging.info("Finished preparing dataset. 
Exiting...") return - try: - model.train() - except: - pass + model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) @@ -285,14 +290,15 @@ def train( # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model if cfg.local_rank == 0: - def terminate_handler(signum, frame, model): + + def terminate_handler(_, __, model): if cfg.flash_optimum: model = BetterTransformer.reverse(model) model.save_pretrained(cfg.output_dir) sys.exit(0) + signal.signal( - signal.SIGINT, - lambda signum, frame: terminate_handler(signum, frame, model) + signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model) ) logging.info("Starting trainer...") @@ -316,7 +322,9 @@ def train( if not Path(cfg.output_dir).is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) if cfg.flash_optimum: - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + with torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=True, enable_mem_efficient=True + ): trainer.train(resume_from_checkpoint=resume_from_checkpoint) else: trainer.train(resume_from_checkpoint=resume_from_checkpoint) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index cba964076..49314372a 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -5,7 +5,8 @@ from hashlib import md5 from pathlib import Path from typing import List, Tuple, Union -from datasets import Dataset, DatasetDict, load_dataset, load_from_disk +import torch +from datasets import Dataset, DatasetDict, IterableDataset, load_dataset, load_from_disk from huggingface_hub import hf_hub_download from transformers import PreTrainedTokenizerBase @@ -392,3 +393,32 @@ def load_prepare_datasets( eval_dataset = dataset["test"] return train_dataset, eval_dataset + + +class PretrainingDatasetWrapper(IterableDataset): + """ + Wrapper for pretraining dataset that avoids loading the dataset into memory + """ + + def __init__(self, tokenizer, dataset_path, max_tokens=2048): + self.tokenizer = tokenizer + self.dataset_path = dataset_path + self.max_tokens = max_tokens + + def __iter__(self): + buffer = [] + for sample in load_dataset( + self.dataset_path, + name="all", + split="train", + streaming=True, + ).shuffle(buffer_size=10000): + buffer += self.tokenizer(sample["text"])["input_ids"] + buffer += [self.tokenizer.eos_token_id] + while len(buffer) > self.max_tokens: + yield torch.tensor(buffer[: self.max_tokens]) + buffer = buffer[self.max_tokens :] + + +def load_pretraining_dataset(path, tokenizer, max_tokens=2048): + return PretrainingDatasetWrapper(tokenizer, path, max_tokens=max_tokens) From 1210dc8fd5c494face7165338f1ed9f2981a2245 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 31 May 2023 21:59:15 -0400 Subject: [PATCH 06/55] more tweaks to do pre-training with bettertransformers --- scripts/finetune.py | 2 ++ src/axolotl/utils/callbacks.py | 24 ++++++++++++++++++++++++ src/axolotl/utils/data.py | 12 +++++++----- src/axolotl/utils/models.py | 4 ++-- src/axolotl/utils/trainer.py | 8 +++++++- src/axolotl/utils/validation.py | 16 ++++++++++++---- 6 files changed, 54 insertions(+), 12 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 88815dfdd..9bed61ca4 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -14,6 +14,7 @@ import torch import yaml # add src to the pythonpath so we don't need to pip install this +from datasets import Dataset from optimum.bettertransformer import BetterTransformer from 
transformers import GenerationConfig, TextStreamer @@ -214,6 +215,7 @@ def train( train_dataset = load_pretraining_dataset( pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len ) + train_dataset = Dataset.from_list(list(train_dataset)) eval_dataset = None if cfg.debug or "debug" in kwargs: diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index f6852249a..ab197304c 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -2,6 +2,7 @@ import os +from optimum.bettertransformer import BetterTransformer from transformers import ( TrainerCallback, TrainerControl, @@ -30,3 +31,26 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public- kwargs["model"].save_pretrained(peft_model_path) return control + + +class SaveBetterTransformerModelCallback( + TrainerCallback +): # pylint: disable=too-few-public-methods + """Callback to save the BatterTransformer wrapped model""" + + def on_save( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + checkpoint_folder = os.path.join( + args.output_dir, + f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", + ) + + model = BetterTransformer.reverse(kwargs["model"]) + model.save_pretrained(checkpoint_folder) + + return control diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 49314372a..164296ee2 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -409,14 +409,16 @@ class PretrainingDatasetWrapper(IterableDataset): buffer = [] for sample in load_dataset( self.dataset_path, - name="all", - split="train", - streaming=True, - ).shuffle(buffer_size=10000): + )["train"].shuffle(): buffer += self.tokenizer(sample["text"])["input_ids"] buffer += [self.tokenizer.eos_token_id] while len(buffer) > self.max_tokens: - yield torch.tensor(buffer[: self.max_tokens]) + input_ids = torch.tensor(buffer[: self.max_tokens]) + yield { + "input_ids": input_ids, + "attention_mask": torch.ones(input_ids.size()), + "labels": input_ids, + } buffer = buffer[self.max_tokens :] diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 11b4629ec..91ef96ca9 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -10,8 +10,8 @@ from typing import TYPE_CHECKING, Optional, Tuple # noqa: F401 import bitsandbytes as bnb import torch import transformers -from transformers import PreTrainedModel # noqa: F401 from optimum.bettertransformer import BetterTransformer +from transformers import PreTrainedModel # noqa: F401 from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -136,7 +136,7 @@ def load_model( logging.info("patching with xpos rope") replace_llama_rope_with_xpos_rope() - if cfg.bf16: + if cfg.bf16 or cfg.bfloat16: torch_dtype = torch.bfloat16 elif cfg.load_in_8bit or cfg.fp16 or cfg.float16: torch_dtype = torch.float16 diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 9ae1e7e93..b7823fea4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -16,7 +16,10 @@ from torch.optim.lr_scheduler import OneCycleLR from transformers import EarlyStoppingCallback, Trainer from transformers.trainer_pt_utils import get_parameter_names -from axolotl.utils.callbacks import SavePeftModelCallback +from axolotl.utils.callbacks import ( + SaveBetterTransformerModelCallback, + SavePeftModelCallback, +) from axolotl.utils.schedulers import InterpolatingLogScheduler @@ -228,6 +231,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, 
model, tokenizer): ]: # only save in rank 0 callbacks.append(SavePeftModelCallback) + if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: + callbacks.append(SaveBetterTransformerModelCallback) + data_collator_kwargs = { "padding": True, } diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index db19900cc..abaaba8d0 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -1,8 +1,10 @@ """Module for validating config files""" import logging + import torch + def validate_config(cfg): if cfg.gradient_accumulation_steps and cfg.batch_size: raise ValueError( @@ -59,14 +61,20 @@ def validate_config(cfg): if cfg.flash_optimum is True: if cfg.adapter: - logging.warning("BetterTransformers probably doesn't work with PEFT adapters") + logging.warning( + "BetterTransformers probably doesn't work with PEFT adapters" + ) if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") if cfg.float16 is not True: - logging.warning("You should probably set float16 to true to load the model in float16 for BetterTransformers") - if torch.__version__.split(".")[0] < 2: + logging.warning( + "You should probably set float16 to true to load the model in float16 for BetterTransformers" + ) + if int(torch.__version__.split(".")[0]) < 2: logging.warning("torch>=2.0.0 required") - raise ValueError(f"flash_optimum for BetterTransformers may not be used with {torch.__version__}") + raise ValueError( + f"flash_optimum for BetterTransformers may not be used with {torch.__version__}" + ) # TODO # MPT 7b From 1a82082e91127fedae540cfbc9e68ce2b3ef08a4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 1 Jun 2023 00:33:13 -0400 Subject: [PATCH 07/55] fix bettertransformers save, force it to skip after saving correctly in callback --- src/axolotl/utils/callbacks.py | 30 +++++++++++++++++++++--------- src/axolotl/utils/trainer.py | 1 + src/axolotl/utils/validation.py | 5 +++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index ab197304c..64bf48664 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -9,7 +9,7 @@ from transformers import ( TrainerState, TrainingArguments, ) -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods @@ -36,21 +36,33 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public- class SaveBetterTransformerModelCallback( TrainerCallback ): # pylint: disable=too-few-public-methods - """Callback to save the BatterTransformer wrapped model""" + """Callback to save the BetterTransformer wrapped model""" - def on_save( + def on_step_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): - checkpoint_folder = os.path.join( - args.output_dir, - f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", - ) + # Save + if ( + args.save_strategy == IntervalStrategy.STEPS + and args.save_steps > 0 + and state.global_step % args.save_steps == 0 + ): + control.should_save = True - model = BetterTransformer.reverse(kwargs["model"]) - model.save_pretrained(checkpoint_folder) + if control.should_save: + checkpoint_folder = os.path.join( + args.output_dir, + f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", + ) + model = BetterTransformer.reverse(kwargs["model"]) + 
model.save_pretrained(checkpoint_folder) + + # since we're saving here, we don't need the trainer loop to attempt to save too b/c + # the trainer will raise an exception since it can't save a BetterTransformer wrapped model + control.should_save = False return control diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index b7823fea4..59b1dc803 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -232,6 +232,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): callbacks.append(SavePeftModelCallback) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: + logging.info("Setting up SaveBetterTransformerModelCallback.") callbacks.append(SaveBetterTransformerModelCallback) data_collator_kwargs = { diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index abaaba8d0..396036621 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -66,9 +66,10 @@ def validate_config(cfg): ) if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") - if cfg.float16 is not True: + if cfg.float16 is not True and cfg.bloat16 is not True: logging.warning( - "You should probably set float16 to true to load the model in float16 for BetterTransformers" + "You should probably set bfloat16 or float16 to true to " + "load the model in float16 for BetterTransformers" ) if int(torch.__version__.split(".")[0]) < 2: logging.warning("torch>=2.0.0 required") From ab5cd28acfd12304201c4c184aa03a5ac3885ce2 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 1 Jun 2023 08:20:08 -0400 Subject: [PATCH 08/55] more gpt-neox long ctx fixes --- src/axolotl/utils/callbacks.py | 1 + src/axolotl/utils/data.py | 10 +++++++--- src/axolotl/utils/models.py | 6 ++++++ src/axolotl/utils/validation.py | 9 ++++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index 64bf48664..526121f2e 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -61,6 +61,7 @@ class SaveBetterTransformerModelCallback( model = BetterTransformer.reverse(kwargs["model"]) model.save_pretrained(checkpoint_folder) + # FIXME - need to cleanup old checkpoints # since we're saving here, we don't need the trainer loop to attempt to save too b/c # the trainer will raise an exception since it can't save a BetterTransformer wrapped model diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 164296ee2..13ad7c75d 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -388,9 +388,13 @@ def load_prepare_datasets( index=cfg.dataset_shard_idx, ) - dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False) - train_dataset = dataset["train"] - eval_dataset = dataset["test"] + if cfg.val_set_size: + dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False) + train_dataset = dataset["train"] + eval_dataset = dataset["test"] + else: + train_dataset = dataset + eval_dataset = None return train_dataset, eval_dataset diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 91ef96ca9..49a9b6f85 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -300,6 +300,12 @@ def load_model( embeddings_len = math.ceil(len(tokenizer) / 32) * 32 model.resize_token_embeddings(embeddings_len) + if cfg.sequence_len >= model.config.max_position_embeddings: + logging.warning( + f"increasing 
model.config.max_position_embeddings to {cfg.sequence_len}" + ) + model.config.max_position_embeddings = cfg.sequence_len + if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 396036621..2e2450fba 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -80,4 +80,11 @@ def validate_config(cfg): # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 - # no 8bit adamw w bf16 + # no 8bit adaAmw w bf16 + + # GPT-NeoX + # evals broken when extending context len + # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product + # attention_mask = causal_mask + attention_mask + # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3 From 1db46a9c720d60113ff2828ab6de219e1b857c79 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 8 Jun 2023 22:05:06 -0400 Subject: [PATCH 09/55] linting fix --- examples/pythia-12b/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md index 0953caa4e..d28d5e77d 100644 --- a/examples/pythia-12b/README.md +++ b/examples/pythia-12b/README.md @@ -7,4 +7,3 @@ python scripts/finetune.py examples/pythia-12b/config.yml ``` ⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️ - From eea2731a5ebc113e769aa2a57af9b96effed2053 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 9 Jun 2023 20:25:38 -0400 Subject: [PATCH 10/55] add streaming dataset support for pretraining datasets --- README.md | 2 + scripts/finetune.py | 23 +----- src/axolotl/utils/data.py | 136 ++++++++++++++++++++++++++------ src/axolotl/utils/validation.py | 5 ++ tests/test_validation.py | 51 ++++++++++++ 5 files changed, 171 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index de929f237..2bc55732d 100644 --- a/README.md +++ b/README.md @@ -410,6 +410,8 @@ optimizer: # specify weight decay weight_decay: +# whether to bettertransformers +flash_optimum: # whether to use xformers attention patch https://github.com/facebookresearch/xformers: xformers_attention: # whether to use flash attention patch https://github.com/HazyResearch/flash-attention: diff --git a/scripts/finetune.py b/scripts/finetune.py index 9bed61ca4..ab226f68f 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -14,7 +14,6 @@ import torch import yaml # add src to the pythonpath so we don't need to pip install this -from datasets import Dataset from optimum.bettertransformer import BetterTransformer from transformers import GenerationConfig, TextStreamer @@ -208,14 +207,11 @@ def train( tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH ) else: - if cfg.pretraining_dataset is True: - pretraining_dataset = "togethercomputer/RedPajama-Data-1T" - else: - pretraining_dataset = cfg.pretraining_dataset train_dataset = load_pretraining_dataset( - pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + cfg.pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len ) - train_dataset = Dataset.from_list(list(train_dataset)) + # 
https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 + train_dataset = train_dataset.with_format("torch") eval_dataset = None if cfg.debug or "debug" in kwargs: @@ -262,19 +258,6 @@ def train( model.save_pretrained(cfg.output_dir) return - if cfg.debug: - logging.info("check_dataset_labels...") - check_dataset_labels( - train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for i in range(5)] # nosec - ), - tokenizer, - ) - - if prepare_ds_only: - logging.info("Finished preparing dataset. Exiting...") - return - model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 13ad7c75d..492d8059b 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -1,12 +1,12 @@ """Module containing data utilities""" - +import functools import logging from hashlib import md5 from pathlib import Path from typing import List, Tuple, Union import torch -from datasets import Dataset, DatasetDict, IterableDataset, load_dataset, load_from_disk +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk from huggingface_hub import hf_hub_download from transformers import PreTrainedTokenizerBase @@ -399,32 +399,116 @@ def load_prepare_datasets( return train_dataset, eval_dataset -class PretrainingDatasetWrapper(IterableDataset): - """ - Wrapper for pretraining dataset that avoids loading the dataset into memory - """ +def encode_pretraining(tokenizer, max_tokens, examples): + res = tokenizer( + examples["text"], + truncation=True, + max_length=max_tokens - 2, + add_special_tokens=True, + ) + # Convert to PyTorch tensors + input_ids = [torch.tensor(seq) for seq in res["input_ids"]] + attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]] + new_input_ids = [] + new_attention_mask = [] + # Append EOS and PAD tokens to input_ids, and correct attention_mask + for i, _ in enumerate(input_ids): + input_ids[i] = torch.cat( + ( + input_ids[i], + torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]), + ), + dim=0, + ) + attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0) - def __init__(self, tokenizer, dataset_path, max_tokens=2048): - self.tokenizer = tokenizer - self.dataset_path = dataset_path - self.max_tokens = max_tokens + # Concatenate tokens so that their lengths are less than max_tokens + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) - def __iter__(self): - buffer = [] - for sample in load_dataset( - self.dataset_path, - )["train"].shuffle(): - buffer += self.tokenizer(sample["text"])["input_ids"] - buffer += [self.tokenizer.eos_token_id] - while len(buffer) > self.max_tokens: - input_ids = torch.tensor(buffer[: self.max_tokens]) - yield { - "input_ids": input_ids, - "attention_mask": torch.ones(input_ids.size()), - "labels": input_ids, - } - buffer = buffer[self.max_tokens :] + for ids, mask in zip(input_ids, attention_mask): + if buffer_input_ids.numel() == max_tokens: + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + elif buffer_input_ids.numel() + ids.numel() <= 
max_tokens: + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + else: + buffer_input_ids = torch.cat( + ( + buffer_input_ids, + torch.full( + (max_tokens - buffer_input_ids.numel(),), + tokenizer.pad_token_id, + dtype=torch.long, + ), + ), + dim=0, + ) + buffer_attention_mask = torch.cat( + ( + buffer_attention_mask, + torch.full( + (max_tokens - buffer_attention_mask.numel(),), + 0, + dtype=torch.long, + ), + ), + dim=0, + ) + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) + + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + + if buffer_input_ids.numel() > 0: # for any leftover tokens + while buffer_input_ids.numel() < max_tokens: # make all sequences equal in size + buffer_input_ids = torch.cat( + ( + buffer_input_ids, + torch.full( + (max_tokens - buffer_input_ids.numel(),), + tokenizer.pad_token_id, + dtype=torch.long, + ), + ), + dim=0, + ) + buffer_attention_mask = torch.cat( + ( + buffer_attention_mask, + torch.full( + (max_tokens - buffer_attention_mask.numel(),), + 0, + dtype=torch.long, + ), + ), + dim=0, + ) + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + + ret = { + "input_ids": [seq.tolist() for seq in new_input_ids], + "labels": [seq.tolist() for seq in new_input_ids], + "attention_mask": [seq.tolist() for seq in new_attention_mask], + } + + logging.debug(len(ret["input_ids"])) + return ret def load_pretraining_dataset(path, tokenizer, max_tokens=2048): - return PretrainingDatasetWrapper(tokenizer, path, max_tokens=max_tokens) + encode = functools.partial(encode_pretraining, tokenizer, max_tokens) + dataset = load_dataset(path, streaming=True, split="train") + dataset = dataset.shuffle(seed=42, buffer_size=10_000) + # TODO dynamically figure out which columns/features to remove + dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"]) + return dataset diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 2e2450fba..603afbfee 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -77,6 +77,11 @@ def validate_config(cfg): f"flash_optimum for BetterTransformers may not be used with {torch.__version__}" ) + if cfg.pretraining_dataset and cfg.group_by_length: + logging.warning( + "You probably want to disable group_by_length as it will force a streamed dataset to download completely." 
+ ) + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 diff --git a/tests/test_validation.py b/tests/test_validation.py index 50bdf37e6..575392ab4 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -198,3 +198,54 @@ class ValidationTest(unittest.TestCase): ) validate_config(cfg) + + def test_flash_optimum(self): + cfg = DictDefault( + { + "flash_optimum": True, + "adapter": "lora", + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "BetterTransformers probably doesn't work with PEFT adapters" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "flash_optimum": True, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "probably set bfloat16 or float16" in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "flash_optimum": True, + "fp16": True, + } + ) + regex_exp = r".*AMP is not supported.*" + + with pytest.raises(ValueError, match=regex_exp): + validate_config(cfg) + + cfg = DictDefault( + { + "flash_optimum": True, + "bf16": True, + } + ) + regex_exp = r".*AMP is not supported.*" + + with pytest.raises(ValueError, match=regex_exp): + validate_config(cfg) From 0c6f928601ac289f7d4b513855feab5047cd7a5a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 14:21:43 -0400 Subject: [PATCH 11/55] address PR feedback --- examples/pythia-12b/README.md | 2 +- examples/pythia-12b/config.yml | 4 ++-- scripts/finetune.py | 5 ++++- src/axolotl/utils/data.py | 4 ++-- src/axolotl/utils/trainer.py | 2 -- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md index d28d5e77d..123ffa710 100644 --- a/examples/pythia-12b/README.md +++ b/examples/pythia-12b/README.md @@ -1,4 +1,4 @@ -# Python 12B +# Pythia 12B - Single-GPU A100 only (?) 
diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml index 28e822c77..3b3d91630 100644 --- a/examples/pythia-12b/config.yml +++ b/examples/pythia-12b/config.yml @@ -22,7 +22,7 @@ lora_dropout: 0.0 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific -wandb_project: pythia-12b +wandb_project: wandb_watch: wandb_run_id: wandb_log_model: @@ -45,5 +45,5 @@ resume_from_checkpoint: local_rank: gradient_checkpointing: true fsdp: -fsdp_transformer_layer_cls_to_wrap: +fsdp_config: collator_pad_to_longest: true diff --git a/scripts/finetune.py b/scripts/finetune.py index ab226f68f..47aada411 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -208,7 +208,10 @@ def train( ) else: train_dataset = load_pretraining_dataset( - cfg.pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + cfg.pretraining_dataset, + tokenizer, + max_tokens=cfg.sequence_len, + seed=cfg.seed, ) # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 train_dataset = train_dataset.with_format("torch") diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 492d8059b..058c24bcd 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -505,10 +505,10 @@ def encode_pretraining(tokenizer, max_tokens, examples): return ret -def load_pretraining_dataset(path, tokenizer, max_tokens=2048): +def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42): encode = functools.partial(encode_pretraining, tokenizer, max_tokens) dataset = load_dataset(path, streaming=True, split="train") - dataset = dataset.shuffle(seed=42, buffer_size=10_000) + dataset = dataset.shuffle(seed=seed, buffer_size=10_000) # TODO dynamically figure out which columns/features to remove dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"]) return dataset diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 59b1dc803..57a08aa53 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,7 +1,6 @@ """Module containing the Trainer class and related functions""" import importlib -import logging import math import os import sys @@ -232,7 +231,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): callbacks.append(SavePeftModelCallback) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: - logging.info("Setting up SaveBetterTransformerModelCallback.") callbacks.append(SaveBetterTransformerModelCallback) data_collator_kwargs = { From 759e8673ce497125da5855a173fd80f57bb071b3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 14:25:21 -0400 Subject: [PATCH 12/55] Update scripts/finetune.py Co-authored-by: NanoCode012 --- scripts/finetune.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 47aada411..cd9234334 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -261,7 +261,6 @@ def train( model.save_pretrained(cfg.output_dir) return - model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) From 958da703762b7759eabdaa6fd7fad231228e1ad9 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 15:28:08 -0400 Subject: [PATCH 13/55] fix formatting --- scripts/finetune.py | 1 - src/axolotl/utils/trainer.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 
cd9234334..2f6bef3ef 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -261,7 +261,6 @@ def train( model.save_pretrained(cfg.output_dir) return - trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) model.config.use_cache = False diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 57a08aa53..b7823fea4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,6 +1,7 @@ """Module containing the Trainer class and related functions""" import importlib +import logging import math import os import sys From c9a149f9e8bacdcd59a9e6de435499b2f4a845c1 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 11 Jun 2023 10:11:17 -0400 Subject: [PATCH 14/55] add check for attr --- src/axolotl/utils/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 49a9b6f85..532fa5518 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -300,7 +300,10 @@ def load_model( embeddings_len = math.ceil(len(tokenizer) / 32) * 32 model.resize_token_embeddings(embeddings_len) - if cfg.sequence_len >= model.config.max_position_embeddings: + if ( + hasattr(model.config, "max_position_embeddings") + and cfg.sequence_len >= model.config.max_position_embeddings + ): logging.warning( f"increasing model.config.max_position_embeddings to {cfg.sequence_len}" ) From 2ba4ae8f461c0c491f9ca303c134f9ad6f725e8c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 12 Jun 2023 10:07:18 -0400 Subject: [PATCH 15/55] tweak config to work --- examples/openllama-3b/config.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index 6fd704ffc..4372876eb 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -26,17 +26,18 @@ wandb_watch: wandb_run_id: wandb_log_model: output_dir: ./openllama-out -batch_size: 16 -micro_batch_size: 4 +gradient_accumulation_steps: 1 +micro_batch_size: 1 num_epochs: 3 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine -learning_rate: 0.0002 +learning_rate: 0.00001 train_on_inputs: false group_by_length: false +float16: true bf16: false -fp16: true +fp16: false tf32: false gradient_checkpointing: true early_stopping_patience: @@ -52,7 +53,7 @@ eval_steps: 50 save_steps: debug: deepspeed: -weight_decay: 0.0 +weight_decay: 0.1 fsdp: fsdp_config: special_tokens: From 34ae69989f1ce1cf4fdf53f0b55c537927dc4b9a Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Mon, 12 Jun 2023 21:39:19 +0200 Subject: [PATCH 16/55] fix inference --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 214bfd14d..3126d81f3 100644 --- a/README.md +++ b/README.md @@ -500,16 +500,16 @@ Pass the appropriate flag to the train command: - Pretrained LORA: ```bash - --inference --lora_model_dir ./completed-model + --inference --lora_model_dir="./lora-output-dir" ``` - Full weights finetune: ```bash - --inference --base_model ./completed-model + --inference --base_model="./completed-model" ``` - Full weights finetune w/ a prompt from a text file: ```bash cat /tmp/prompt.txt | python scripts/finetune.py configs/your_config.yml \ - --base_model ./completed-model --inference --prompter=None --load_in_8bit=True + --base_model="./completed-model" --inference --prompter=None --load_in_8bit=True ``` ### Merge LORA to base From 4b43a66a0b2e902ecfa49ab932e8df292e5e53dd Mon Sep 
17 00:00:00 2001 From: Wing Lian Date: Mon, 12 Jun 2023 18:38:38 -0400 Subject: [PATCH 17/55] update alpaca_chat prompts for instructions to explainn the conversation --- src/axolotl/prompt_strategies/alpaca_chat.py | 23 +++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 0f8c31d6a..1183c1e8e 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -20,11 +20,24 @@ def load(tokenizer, cfg): class AlpacaConcisePrompter(AlpacaPrompter): """ - Alpaca Prompter extending the system prompt to ask for concise answers + Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers """ - system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that concisely and appropriately completes the request.\n\n" - system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately and concisely completes the request.\n\n" + system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n" + system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n" + + +class AlpacaChatPrompter(AlpacaPrompter): + """ + Alpaca Chat Prompter extending the system prompt to for chat-instruct answers + """ + + system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n" + system_no_input_prompt = "Below is an instruction from a USER that describes a task. 
The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n" + + def __init__(self): # pylint: disable=super-init-not-called + self.prompt_style = PromptStyle.CHAT.value + self.match_prompt_style() class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): @@ -64,7 +77,7 @@ def load_concise(tokenizer, cfg): def load_qa(tokenizer, cfg): return AlpacaQAPromptTokenizingStrategy( - AlpacaPrompter(PromptStyle.CHAT.value), + AlpacaChatPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len, @@ -73,7 +86,7 @@ def load_qa(tokenizer, cfg): def load_camel_ai(tokenizer, cfg): return CamelAIPromptTokenizingStrategy( - AlpacaPrompter(PromptStyle.CHAT.value), + AlpacaChatPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len, From dc77c8ebce8ec4135f4e0c03a9d336b3f0957358 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 13 Jun 2023 12:01:46 +0900 Subject: [PATCH 18/55] chore: Refactor inf_kwargs out --- scripts/finetune.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 283100c8a..785f3cf23 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -63,7 +63,7 @@ def get_multi_line_input() -> Optional[str]: return instruction -def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"): +def do_inference(cfg, model, tokenizer, prompter: Optional[str]): default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} for token, symbol in default_tokens.items(): @@ -257,13 +257,13 @@ def train( if cfg.inference: logging.info("calling do_inference function") - inf_kwargs: Dict[str, Any] = {} + prompter: Optional[str] = "AlpacaPrompter" if "prompter" in kwargs: if kwargs["prompter"] == "None": - inf_kwargs["prompter"] = None + prompter = None else: - inf_kwargs["prompter"] = kwargs["prompter"] - do_inference(cfg, model, tokenizer, **inf_kwargs) + prompter = kwargs["prompter"] + do_inference(cfg, model, tokenizer, prompter=prompter) return if "shard" in kwargs: From 5ff547dc703e7dfc09e56baf5fe2749e56076961 Mon Sep 17 00:00:00 2001 From: PocketDoc Labs Date: Mon, 12 Jun 2023 22:38:10 -0700 Subject: [PATCH 19/55] Update README.md to include a community showcase --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 3126d81f3..c4e1887e9 100644 --- a/README.md +++ b/README.md @@ -552,6 +552,16 @@ Building something cool with Axolotl? Consider adding a badge to your model card [Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +## Community Showcase + +Open Access AI Collective +- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b) +- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b) +- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat) + +PocketDoc Labs +- [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA) + ## Contributing 🤝 Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new). 
From 3513885f434a1668754883adc3a050fe658c4d8f Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 14 Jun 2023 01:10:58 +0900 Subject: [PATCH 20/55] Fix sharegpt type --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c4e1887e9..5a00cccac 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"instruction": "...", "input": "...", "output": "..."} ``` -- `sharegpt`: conversations +- `sharegpt:chat`: conversations ```json {"conversations": [{"from": "...", "value": "..."}]} ``` From 556fe408b3ac9117b825705d5f08982377377dd8 Mon Sep 17 00:00:00 2001 From: "maciej.karasek" <103371156+MaciejKarasek@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:59:57 +0200 Subject: [PATCH 21/55] issue #205 bugfix --- src/axolotl/utils/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 05acfce93..103c707f2 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -252,11 +252,11 @@ def load_model( ) # Shouldn't be a problem most of the time. will obviously error if the model doesn't support this # when training starts - if hasattr(config, "max_seq_len") and cfg.sequence_len > config.max_seq_len: + if hasattr(config, "max_seq_len") and config.max_seq_len and cfg.sequence_len > config.max_seq_len: config.max_seq_len = cfg.sequence_len logging.warning(f"increasing context length to {cfg.sequence_len}") elif ( - hasattr(config, "max_sequence_length") + hasattr(config, "max_sequence_length") and config.max_sequence_length and cfg.sequence_len > config.max_sequence_length ): config.max_sequence_length = cfg.sequence_len @@ -289,7 +289,7 @@ def load_model( model.resize_token_embeddings(embeddings_len) if ( - hasattr(model.config, "max_position_embeddings") + hasattr(model.config, "max_position_embeddings") and model.config.max_position_embeddings and cfg.sequence_len >= model.config.max_position_embeddings ): logging.warning( From 136522f9c9bbb4658f9ebaa5f528366d9c15b2ae Mon Sep 17 00:00:00 2001 From: "maciej.karasek" <103371156+MaciejKarasek@users.noreply.github.com> Date: Wed, 14 Jun 2023 20:02:09 +0200 Subject: [PATCH 22/55] style correction --- src/axolotl/utils/models.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 103c707f2..c6d380267 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -252,11 +252,16 @@ def load_model( ) # Shouldn't be a problem most of the time. 
will obviously error if the model doesn't support this # when training starts - if hasattr(config, "max_seq_len") and config.max_seq_len and cfg.sequence_len > config.max_seq_len: + if ( + hasattr(config, "max_seq_len") + and config.max_seq_len + and cfg.sequence_len > config.max_seq_len + ): config.max_seq_len = cfg.sequence_len logging.warning(f"increasing context length to {cfg.sequence_len}") elif ( - hasattr(config, "max_sequence_length") and config.max_sequence_length + hasattr(config, "max_sequence_length") + and config.max_sequence_length and cfg.sequence_len > config.max_sequence_length ): config.max_sequence_length = cfg.sequence_len @@ -289,7 +294,8 @@ def load_model( model.resize_token_embeddings(embeddings_len) if ( - hasattr(model.config, "max_position_embeddings") and model.config.max_position_embeddings + hasattr(model.config, "max_position_embeddings") + and model.config.max_position_embeddings and cfg.sequence_len >= model.config.max_position_embeddings ): logging.warning( From 945c4191a33753fee06d04b7ab3005df91b0feaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steffen=20R=C3=B6cker?= Date: Wed, 14 Jun 2023 20:09:26 +0200 Subject: [PATCH 23/55] Use AutoTokenizer for redpajama example --- examples/redpajama/config-3b.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/redpajama/config-3b.yml b/examples/redpajama/config-3b.yml index e7342b2f7..869c0883e 100644 --- a/examples/redpajama/config-3b.yml +++ b/examples/redpajama/config-3b.yml @@ -1,7 +1,7 @@ base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1 base_model_config: togethercomputer/RedPajama-INCITE-Chat-3B-v1 model_type: GPTNeoXForCausalLM -tokenizer_type: GPTNeoXTokenizer +tokenizer_type: AutoTokenizer trust_remote_code: load_in_8bit: false datasets: From 7925ddce866daa03b6df9b044b1b8f4222fd5edf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 01:59:33 -0400 Subject: [PATCH 24/55] bugfix for potential off by one --- src/axolotl/prompt_strategies/alpaca_chat.py | 12 +++++ src/axolotl/prompt_tokenizers.py | 32 +++++++------- tests/test_prompt_tokenizers.py | 46 ++++++++++++++++++-- 3 files changed, 72 insertions(+), 18 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 1183c1e8e..6161d7e37 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -40,6 +40,18 @@ class AlpacaChatPrompter(AlpacaPrompter): self.match_prompt_style() +class NoSystemPrompter(AlpacaPrompter): + """ + Null Prompter with no system prompts + """ + + prompt_input = "{instruction} {input} " + prompt_no_input = "{instruction} " + + def __init__(self): # pylint: disable=super-init-not-called + pass + + class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for AlpacaQA diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 8b3c88fee..6408620d7 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -96,25 +96,27 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy): input, # pylint: disable=redefined-builtin response, ) = self.parse_instruction_fields(prompt) - full_prompt = self._build_full_prompt(instruction, input, response) - tokenized_full_prompt = self._tokenize(full_prompt) - if not self.train_on_inputs: - user_prompt = next( - iter( - self.prompter.build_prompt( - instruction, - input, - ) + user_prompt = next( + iter( + self.prompter.build_prompt( 
+ instruction, + input, ) ) - tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False) - user_prompt_len = len(tokenized_user_prompt["input_ids"]) + ) + tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False) + if not self.train_on_inputs: + user_prompt_len = len(tokenized_prompt["input_ids"]) # TODO this could be sped up using numpy array slicing - tokenized_full_prompt["labels"] = [ - -100 - ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:] + tokenized_prompt["labels"] = [-100] * user_prompt_len + tokenized_res_prompt = self._tokenize( + response, strip_bos_token=True, add_eos_token=True + ) + tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"] + tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"] + tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"] - return tokenized_full_prompt + return tokenized_prompt def _build_full_prompt( self, instruction, input, response # pylint: disable=redefined-builtin diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 89209e84f..abc746bbf 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -6,8 +6,12 @@ from pathlib import Path from transformers import AutoTokenizer -from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy -from axolotl.prompters import ShareGPTPrompter +from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter +from axolotl.prompt_tokenizers import ( + AlpacaPromptTokenizingStrategy, + ShareGPTPromptTokenizingStrategy, +) +from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter logging.basicConfig(level="INFO") @@ -29,7 +33,6 @@ class TestPromptTokenizationStrategies(unittest.TestCase): ) def test_sharegpt_integration(self): - print(Path(__file__).parent) with open( Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8" ) as fin: @@ -53,6 +56,43 @@ class TestPromptTokenizationStrategies(unittest.TestCase): self.assertEqual(len(example[fields]), len(tokenized_conversation[fields])) self.assertEqual(example[fields], tokenized_conversation[fields]) + def test_completion(self): + """ + tests the interface between the user and assistant parts + """ + prompter = NoSystemPrompter() + strat = AlpacaPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = { + "instruction": "hello cruel. lorem ipsum dolor sit amet.", + "output": "world!", + } + example = strat.tokenize_prompt(sample) + world_idx = example["input_ids"].index(3186) + assert example["labels"][world_idx] == 3186 + assert example["labels"][world_idx - 1] == -100 + + def test_alpaca(self): + """ + tests the interface between the user and assistant parts + """ + prompter = AlpacaPrompter() + strat = AlpacaPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = {"instruction": "hello!", "output": "Hi! 
How can I help?"} + example = strat.tokenize_prompt(sample) + world_idx = example["input_ids"].index(6324) + assert example["labels"][world_idx] == 6324 + assert example["labels"][world_idx - 1] == -100 + if __name__ == "__main__": unittest.main() From baed440fa16552ea32bebfea30c389fcadda6d33 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:03:53 -0400 Subject: [PATCH 25/55] ingore duplicate code in tests --- tests/test_prompt_tokenizers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index abc746bbf..8d9635c0e 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -61,6 +61,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): tests the interface between the user and assistant parts """ prompter = NoSystemPrompter() + # pylint: disable=duplicate-code strat = AlpacaPromptTokenizingStrategy( prompter, self.tokenizer, @@ -80,6 +81,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): """ tests the interface between the user and assistant parts """ + # pylint: disable=duplicate-code prompter = AlpacaPrompter() strat = AlpacaPromptTokenizingStrategy( prompter, From 88e17ffc500173d8b6baae50195409edfc9a10ea Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 00:26:44 -0400 Subject: [PATCH 26/55] add float16 docs and tweak typehints --- README.md | 8 ++++++++ src/axolotl/utils/models.py | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e267a9d6d..225ef0dd7 100644 --- a/README.md +++ b/README.md @@ -264,6 +264,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic bf16: true # require >=ampere fp16: true tf32: true # require >=ampere + bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP + float16: true # use instead of fp16 when you don't want AMP ``` Note: Repo does not do 4-bit quantization. @@ -522,6 +524,12 @@ Add below flag to train command above --merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False ``` +If you run out of CUDA memory, you can try to merge in system RAM with + +```bash +CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ... +``` + ## Common Errors 🧰 > Cuda out of memory diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index c6d380267..2ae9a26aa 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,13 +11,14 @@ import bitsandbytes as bnb import torch import transformers from optimum.bettertransformer import BetterTransformer -from transformers import PreTrainedModel # noqa: F401 -from transformers import ( +from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaConfig, + PreTrainedModel, + PreTrainedTokenizerBase, ) from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN @@ -71,7 +72,7 @@ def load_tokenizer( def load_model( base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora" ): - # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]] + # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]] """ Load a model from a base model and a model type. 
""" @@ -284,6 +285,7 @@ def load_model( model = AutoModelForCausalLM.from_pretrained( base_model, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, + load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, torch_dtype=torch_dtype, device_map=cfg.device_map, trust_remote_code=cfg.trust_remote_code or False, From d7635b71486c65629f2ec1e4fe8c70396366aa96 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:06:27 -0400 Subject: [PATCH 27/55] hint to what AMP means --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 225ef0dd7..d6c9cfefb 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic bf16: true # require >=ampere fp16: true tf32: true # require >=ampere - bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP + bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision) float16: true # use instead of fp16 when you don't want AMP ``` Note: Repo does not do 4-bit quantization. From 1ab3bf3e6772be2165a8504430c61d0d1b55e32f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:09:33 -0400 Subject: [PATCH 28/55] fix test name --- tests/test_prompt_tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 8d9635c0e..aba340eee 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -56,7 +56,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): self.assertEqual(len(example[fields]), len(tokenized_conversation[fields])) self.assertEqual(example[fields], tokenized_conversation[fields]) - def test_completion(self): + def test_no_sys_prompt(self): """ tests the interface between the user and assistant parts """ From 6d0ee4ba34fbf20e9846ce24875448019f8dba65 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 08:40:41 -0400 Subject: [PATCH 29/55] support adamw and grad norm hyperparams --- src/axolotl/utils/trainer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 5152e649b..5cf3107f3 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -115,6 +115,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): # TODO search Path("./") for one training_arguments_kwargs["deepspeed"] = "./ds_config.json" + if cfg.adam_beta1: + training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1 + if cfg.adam_beta2: + training_arguments_kwargs["adam_beta2"] = cfg.adam_beta2 + if cfg.adam_epsilon: + training_arguments_kwargs["adam_epsilon"] = cfg.adam_epsilon + if cfg.max_grad_norm: + training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm + training_args = transformers.TrainingArguments( per_device_train_batch_size=cfg.micro_batch_size, per_device_eval_batch_size=cfg.eval_batch_size From c969f0a9dc28c9f095a2bb6b3ecede0216d909b5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 08:43:20 -0400 Subject: [PATCH 30/55] add docs --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index d6c9cfefb..5fbac1a48 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,12 @@ log_sweep_max_lr: optimizer: # specify weight decay weight_decay: +# adamw hyperparams +adam_beta1: +adam_beta2: +adam_epsilon: +# Gradient clipping max norm +max_grad_norm: # whether to 
bettertransformers flash_optimum: From cb9d3af5c00e0189f95c03d64efdc283aec54679 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 09:39:42 -0400 Subject: [PATCH 31/55] add validation and tests for adamw hyperparam --- src/axolotl/utils/validation.py | 5 ++++ tests/test_validation.py | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 298d36c4e..2e0da69b3 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -87,6 +87,11 @@ def validate_config(cfg): "You probably want to disable group_by_length as it will force a streamed dataset to download completely." ) + if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and ( + not cfg.optimizer or "adamw" not in cfg.optimizer + ): + logging.warning("adamw hyperparameters found, but no adamw optimizer set") + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 diff --git a/tests/test_validation.py b/tests/test_validation.py index dba54586e..cc6d29a23 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -263,3 +263,45 @@ class ValidationTest(unittest.TestCase): with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) + + def test_adamw_hyperparams(self): + cfg = DictDefault( + { + "optimizer": None, + "adamw_epsilon": 0.0001, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "adamw hyperparameters found, but no adamw optimizer set" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "optimizer": "adafactor", + "adamw_beta1": 0.0001, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "adamw hyperparameters found, but no adamw optimizer set" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "optimizer": "adamw_bnb_8bit", + "adamw_beta1": 0.0001, + "adamw_beta2": 0.0001, + "adamw_epsilon": 0.0001, + } + ) + + validate_config(cfg) From ad5ca4f734721d66b9c10a58ba7141bf13694452 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 10:12:47 -0400 Subject: [PATCH 32/55] Additional test case per pr --- tests/test_validation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index cc6d29a23..d39a4618e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -305,3 +305,11 @@ class ValidationTest(unittest.TestCase): ) validate_config(cfg) + + cfg = DictDefault( + { + "optimizer": "adafactor", + } + ) + + validate_config(cfg) From d35278aaf1b5829747ee8dbc1952c357bc4d1c6b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 16:01:27 -0400 Subject: [PATCH 33/55] don't fail fast --- .github/workflows/base.yml | 1 + .github/workflows/main.yml | 1 + .github/workflows/tests.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index c5a70978b..623083db2 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -12,6 +12,7 @@ jobs: # this job needs to be run on self-hosted GPU runners... 
runs-on: self-hosted
     strategy:
+      fail-fast: false
       matrix:
         include:
           - cuda: "118"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4e7705b7d..033199154 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,6 +11,7 @@ jobs:
     if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
     strategy:
+      fail-fast: false
       matrix:
         include:
           - cuda: cu118
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0fc7ac9d9..d5184def6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,6 +7,7 @@ jobs:
   test:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python_version: ["3.9", "3.10"]
     timeout-minutes: 10

From 9bdd30cdfdfad725b03620fdb933689fe1b828d5 Mon Sep 17 00:00:00 2001
From: Utensil
Date: Wed, 21 Jun 2023 08:00:58 +0000
Subject: [PATCH 34/55] Support loading data files from a local directory

ref:
https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path
---
 src/axolotl/utils/data.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index c36bfcee9..eed7d6db1 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
                 pass
 
             # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
+                    ds = load_dataset(
+                        d.path,
+                        data_files=d.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory nor a file"
+                    )
             elif ds_from_hub:
                 if d.data_files:
                     ds = load_dataset(

From 0aeb7c7802fa59586860035e9bbff9f25aabb211 Mon Sep 17 00:00:00 2001
From: Mahesh Sinha
Date: Wed, 21 Jun 2023 15:34:48 +0200
Subject: [PATCH 35/55] Fixing Data Readme

---
 data/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/README.md b/data/README.md
index 34d7a5659..c452ece7c 100644
--- a/data/README.md
+++ b/data/README.md
@@ -10,10 +10,10 @@ curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarit
 ## Convert the JSON data files to JSONL.
```shell -python3 ./scripts/alpaca_json_to_jsonl.py --input data/alpaca_data_gpt4.json > data/alpaca_data_gpt4.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/vicuna_cleaned.json > data/vicuna_cleaned.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/roleplay-similarity_0.6-instruct-dataset.json > data/roleplay-similarity_0.6-instruct-dataset.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/gpt4-instruct-similarity-0.6-dataset.json > data/gpt4-instruct-similarity-0.6-dataset.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl ``` --- From 47d601fa2389a7f7a0dac0bd767e669c3a326cbe Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 10:19:49 -0400 Subject: [PATCH 36/55] optionally define whether to use_fast tokenizer --- README.md | 2 ++ src/axolotl/utils/models.py | 5 +++++ tests/test_tokenizers.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 tests/test_tokenizers.py diff --git a/README.md b/README.md index 5fbac1a48..047d6aa34 100644 --- a/README.md +++ b/README.md @@ -302,6 +302,8 @@ model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Trust remote code for untrusted source trust_remote_code: +# use_fast option for tokenizer loading from_pretrained, default to True +tokenizer_use_fast: # whether you are training a 4-bit GPTQ quantized model gptq: true diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 2ae9a26aa..6d94cd674 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -34,15 +34,20 @@ def load_tokenizer( tokenizer_type, cfg, ): + use_fast = True # this is the default + if cfg.tokenizer_use_fast is not None: + use_fast = cfg.tokenizer_use_fast if tokenizer_type: tokenizer = getattr(transformers, tokenizer_type).from_pretrained( tokenizer_config, trust_remote_code=cfg.trust_remote_code or False, + use_fast=use_fast, ) else: tokenizer = AutoTokenizer.from_pretrained( tokenizer_config, trust_remote_code=cfg.trust_remote_code or False, + use_fast=use_fast, ) logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py new file mode 100644 index 000000000..f2521e8e7 --- /dev/null +++ b/tests/test_tokenizers.py @@ -0,0 +1,31 @@ +""" +Test cases for the tokenizer loading +""" +import unittest + +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_tokenizer + + +class TestTokenizers(unittest.TestCase): + """ + test class for the load_tokenizer fn + """ + + def test_default_use_fast(self): + cfg = DictDefault({}) + tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg) + assert "Fast" in tokenizer.__class__.__name__ + + def test_dont_use_fast(self): + cfg = DictDefault( + { + "tokenizer_use_fast": False, + } + ) + tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg) + assert "Fast" not in tokenizer.__class__.__name__ + + +if __name__ == "__main__": + unittest.main() From 645c13592c06f653fd6337194d20dddba8ae8bf2 
Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 10:26:02 -0400 Subject: [PATCH 37/55] better py3 support w pre-commit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0eb2db49..c811a6eb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ default_language_version: - python: python3.9 + python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks From 8d20e0a3d3f44721bb3e45f4a6d51577dd7099bc Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 17 Jun 2023 19:22:58 -0400 Subject: [PATCH 38/55] initial wip to get sys prompt from dataset --- src/axolotl/prompt_strategies/alpaca_chat.py | 6 +- src/axolotl/prompt_tokenizers.py | 4 +- src/axolotl/prompters.py | 87 ++++++++++++-------- tests/test_prompters.py | 69 +++++++++++++++- 4 files changed, 126 insertions(+), 40 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 6161d7e37..32801c3c3 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -45,8 +45,10 @@ class NoSystemPrompter(AlpacaPrompter): Null Prompter with no system prompts """ - prompt_input = "{instruction} {input} " - prompt_no_input = "{instruction} " + system_prompt = "" + system_no_input_prompt = "" + turn_format = "{instruction} {input} " + turn_no_input_format = "{instruction} " def __init__(self): # pylint: disable=super-init-not-called pass diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 6408620d7..cf80539eb 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -87,7 +87,9 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy): Tokenizing strategy for instruction-based prompts. """ - def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: + def parse_instruction_fields( + self, prompt + ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]: raise NotImplementedError def tokenize_prompt(self, prompt): diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index 29cc4446b..4db915238 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -24,6 +24,8 @@ class AlpacaPrompter: system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n" system_no_input_prompt = "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n" + turn_format: str + turn_no_input_format: str prompt_style: Optional[PromptStyle] = None def __init__(self, prompt_style=PromptStyle.INSTRUCT.value): @@ -32,23 +34,13 @@ class AlpacaPrompter: def match_prompt_style(self): if self.prompt_style == PromptStyle.INSTRUCT.value: - self.prompt_input = ( - self.system_prompt - + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" + self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" + self.turn_no_input_format = ( + "### Instruction:\n{instruction}\n\n### Response:\n" ) - self.prompt_no_input = ( - self.system_no_input_prompt - + "### Instruction:\n{instruction}\n\n### Response:\n" - ) - self.response_split = "### Response:" if self.prompt_style == PromptStyle.CHAT.value: - self.prompt_input = ( - self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:" - ) - self.prompt_no_input = ( - self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:" - ) - self.response_split = "ASSISTANT:" + self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" def build_prompt( self, @@ -59,15 +51,39 @@ class AlpacaPrompter: # returns the full prompt from instruction and optional input # if a label (=response, =output) is provided, it's also appended. if input: - res = self.prompt_input.format(instruction=instruction, input=input) + res = self.system_prompt + self.turn_format.format( + instruction=instruction, input=input + ) else: - res = self.prompt_no_input.format(instruction=instruction) + res = self.system_no_input_prompt + self.turn_no_input_format.format( + instruction=instruction + ) if output: res = f"{res}{output}" yield res - def get_response(self, output: str) -> str: - return output.split(self.response_split)[1].strip() + +class SystemDataPrompter(AlpacaPrompter): + """ + Alpaca Style Prompter that uses system prompts from the dataset + """ + + def build_prompt_w_system( + self, + system: str, + instruction: str, + input: Union[None, str] = None, # pylint: disable=redefined-builtin + output: Union[None, str] = None, + ) -> Generator[str, None, None]: + # returns the full prompt from instruction and optional input + # if a label (=response, =output) is provided, it's also appended. + if input: + res = system + self.turn_format.format(instruction=instruction, input=input) + else: + res = system + self.turn_no_input_format.format(instruction=instruction) + if output: + res = f"{res}{output}" + yield res class UnpromptedPrompter(AlpacaPrompter): @@ -93,7 +109,10 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter): """ system_prompt = ( - "Choose the answer that best answers the question. Explain your reasoning." + "Choose the answer that best answers the question. Explain your reasoning.\n" + ) + system_no_input_prompt = ( + "Choose the answer that best answers the question. Explain your reasoning.\n" ) @@ -102,7 +121,12 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter): Prompter for multiple choice concise """ - prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n" + system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n" + system_no_input_prompt = "Choose the answer that best answers the question. 
Be concise in your response.\n\n" + + def match_prompt_style(self): + self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" class SummarizeTLDRPrompter(AlpacaPrompter): @@ -110,9 +134,12 @@ class SummarizeTLDRPrompter(AlpacaPrompter): Prompter for summarize TLDR """ - prompt_no_input = ( - "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:" - ) + system_prompt = "" + system_no_input_prompt = "" + + def match_prompt_style(self): + self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:" class CompletionPrompter: @@ -128,9 +155,6 @@ class CompletionPrompter: ) -> Generator[str, None, None]: yield instruction - def get_response(self, output: str) -> str: - return output.strip() - class GPTeacherPrompter(AlpacaPrompter): """ @@ -210,9 +234,6 @@ class ReflectAlpacaPrompter: res = f"{res}{label}" yield res - def get_response(self, output: str) -> str: - return output.split(self.response_split)[1].strip() - class SeparatorStyle(Enum): """Different separator style.""" @@ -289,12 +310,6 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods sep2=" ", ) - # def match_prompt_style(self): - # if self.prompt_style == PromptStyle.chat.value: - # self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:" - # self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:" - # self.response_split = "ASSISTANT:" - def build_prompt(self, source) -> Generator[str, None, None]: # ignore the system prompt if provided if source[0]["from"] == "system": diff --git a/tests/test_prompters.py b/tests/test_prompters.py index 11610ccc5..bb33afbb6 100644 --- a/tests/test_prompters.py +++ b/tests/test_prompters.py @@ -2,7 +2,13 @@ import unittest -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import ( + AlpacaPrompter, + MultipleChoiceExplainPrompter, + PromptStyle, + SystemDataPrompter, + UnpromptedPrompter, +) class AlpacaPrompterTest(unittest.TestCase): @@ -55,3 +61,64 @@ class AlpacaPrompterTest(unittest.TestCase): assert "### Response:" not in res assert "USER:" in res assert "ASSISTANT:" in res + + def test_system_prompt(self): + prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value) + res = next( + prompter.build_prompt_w_system( + "use cot", "tell me a joke about the following", "alpacas" + ) + ) + assert "use cot" in res + assert res.startswith("use cot") + assert "### Instruction:" not in res + assert "### Input:" not in res + assert "alpacas" in res + assert "### Response:" not in res + assert "USER:" in res + assert "ASSISTANT:" in res + + +class UnpromptedPrompterTest(unittest.TestCase): + """ + Test class for UnpromptedPrompter with no system prompts + """ + + def test_prompt_style_w_none(self): + prompter = UnpromptedPrompter(prompt_style=None) + res = next(prompter.build_prompt("tell me a joke")) + assert "### Instruction:" in res + assert "tell me a joke" in res + assert res.startswith("###") + + def test_prompt_style_w_instruct(self): + prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value) + res = next( + prompter.build_prompt("tell me a joke about the following", "alpacas") + ) + assert "### Instruction:" in res + assert "tell me a joke" in res + assert res.startswith("###") + + def test_prompt_style_w_chat(self): + prompter 
= UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "USER:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("USER:")
+
+
+class MultipleChoiceExplainPrompterTest(unittest.TestCase):
+    """
+    Test class for MultipleChoiceExplainPrompter
+    """
+
+    def test_prompt_style_w_chat(self):
+        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
+        assert "USER:" in res
+        assert "choose one" in res
+        assert "Choose the answer that best answers the question." in res
+        assert "- A\n- B\n- C" in res

From 3a38271276224741fc9b2766b322a9bc54bba9c3 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 17 Jun 2023 23:52:40 -0400
Subject: [PATCH 39/55] add tests and support for loader for sys prompt data

---
 .../prompt_strategies/alpaca_w_system.py      | 83 +++++++++++++++++++
 src/axolotl/prompters.py                      | 23 -----
 src/axolotl/utils/tokenization.py             |  2 +
 tests/test_prompt_tokenizers.py               | 40 ++++++++-
 tests/test_prompters.py                       |  2 +-
 5 files changed, 125 insertions(+), 25 deletions(-)
 create mode 100644 src/axolotl/prompt_strategies/alpaca_w_system.py

diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py
new file mode 100644
index 000000000..88acf0d0e
--- /dev/null
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -0,0 +1,83 @@
+"""
+Prompt strategies loader for alpaca instruction datasets with system prompts
+"""
+from typing import Generator, Tuple, Union
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for instruction-based prompts.
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        return (
+            prompt["instruction"],
+            prompt["input"] if "input" in prompt else "",
+            prompt["output"],
+            prompt["system"],
+        )
+
+    def tokenize_prompt(self, prompt):
+        (
+            instruction,
+            input,  # pylint: disable=redefined-builtin
+            response,
+            system,
+        ) = self.parse_instruction_fields(prompt)
+        user_prompt = next(
+            iter(
+                self.prompter.build_prompt_w_system(
+                    system,
+                    instruction,
+                    input,
+                )
+            )
+        )
+        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
+        if not self.train_on_inputs:
+            user_prompt_len = len(tokenized_prompt["input_ids"])
+            # TODO this could be sped up using numpy array slicing
+            tokenized_prompt["labels"] = [-100] * user_prompt_len
+        tokenized_res_prompt = self._tokenize(
+            response, strip_bos_token=True, add_eos_token=True
+        )
+        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
+        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
+        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
+
+        return tokenized_prompt
+
+
+class SystemDataPrompter(AlpacaPrompter):
+    """
+    Alpaca Style Prompter that uses system prompts from the dataset
+    """
+
+    def build_prompt_w_system(
+        self,
+        system: str,
+        instruction: str,
+        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        output: Union[None, str] = None,
+    ) -> Generator[str, None, None]:
+        # returns the full prompt from instruction and optional input
+        # if a label (=response, =output) is provided, it's also appended.
+ if input: + res = system + self.turn_format.format(instruction=instruction, input=input) + else: + res = system + self.turn_no_input_format.format(instruction=instruction) + if output: + res = f"{res}{output}" + yield res + + +def load(tokenizer, cfg): + return InstructionWSystemPromptTokenizingStrategy( + SystemDataPrompter(PromptStyle.CHAT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index 4db915238..715a227c8 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -63,29 +63,6 @@ class AlpacaPrompter: yield res -class SystemDataPrompter(AlpacaPrompter): - """ - Alpaca Style Prompter that uses system prompts from the dataset - """ - - def build_prompt_w_system( - self, - system: str, - instruction: str, - input: Union[None, str] = None, # pylint: disable=redefined-builtin - output: Union[None, str] = None, - ) -> Generator[str, None, None]: - # returns the full prompt from instruction and optional input - # if a label (=response, =output) is provided, it's also appended. - if input: - res = system + self.turn_format.format(instruction=instruction, input=input) - else: - res = system + self.turn_no_input_format.format(instruction=instruction) - if output: - res = f"{res}{output}" - yield res - - class UnpromptedPrompter(AlpacaPrompter): """ Prompter for alpaca no system prompt diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py index 1c535eb1b..7d0d1dd83 100644 --- a/src/axolotl/utils/tokenization.py +++ b/src/axolotl/utils/tokenization.py @@ -34,3 +34,5 @@ def check_example_labels(example, tokenizer): logging.info(" ".join(colored_tokens)) logging.info("\n\n\n") + + return " ".join(colored_tokens) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index aba340eee..3ddbe77bf 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -7,11 +7,15 @@ from pathlib import Path from transformers import AutoTokenizer from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter +from axolotl.prompt_strategies.alpaca_w_system import ( + InstructionWSystemPromptTokenizingStrategy, + SystemDataPrompter, +) from axolotl.prompt_tokenizers import ( AlpacaPromptTokenizingStrategy, ShareGPTPromptTokenizingStrategy, ) -from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter +from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter logging.basicConfig(level="INFO") @@ -96,5 +100,39 @@ class TestPromptTokenizationStrategies(unittest.TestCase): assert example["labels"][world_idx - 1] == -100 +class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase): + """ + Test class for prompt tokenization strategies with sys prompt from the dataset + """ + + def setUp(self) -> None: + # pylint: disable=duplicate-code + self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") + self.tokenizer.add_special_tokens( + { + "bos_token": "", + "eos_token": "", + "unk_token": "", + } + ) + + def test_system_alpaca(self): + prompter = SystemDataPrompter(PromptStyle.CHAT.value) + strat = InstructionWSystemPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = { + "system": "use cot", + "instruction": "hello!", + "output": "Hi! 
How can I help?", + } + example = strat.tokenize_prompt(sample) + assert example["input_ids"][0:3] == [1, 671, 20118] # use cot + assert example["input_ids"][3] == 11889 # USER + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_prompters.py b/tests/test_prompters.py index bb33afbb6..756b6f81b 100644 --- a/tests/test_prompters.py +++ b/tests/test_prompters.py @@ -2,11 +2,11 @@ import unittest +from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter from axolotl.prompters import ( AlpacaPrompter, MultipleChoiceExplainPrompter, PromptStyle, - SystemDataPrompter, UnpromptedPrompter, ) From 7b57ed761882b4492659eeafffbf8ffddd3f0fbb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 18 Jun 2023 06:40:28 -0400 Subject: [PATCH 40/55] pylint for duplicated code for system prompts --- src/axolotl/datasets.py | 1 + src/axolotl/prompt_strategies/alpaca_w_system.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index 40c58bc9c..5593a8dd3 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -126,6 +126,7 @@ class ConstantLengthDataset(IterableDataset): buffer_len = 0 if example: + # FIXME # just going to drop data points that are too long if len(example["input_ids"]) <= self.seq_length: input_ids = example["input_ids"] diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index 88acf0d0e..aacae8739 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -21,6 +21,7 @@ class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy): ) def tokenize_prompt(self, prompt): + # pylint: disable=duplicate-code ( instruction, input, # pylint: disable=redefined-builtin From 05ab9092e304f234801d6496cecb60d49d86c0a4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 22:40:50 -0400 Subject: [PATCH 41/55] skip the system prompt --- src/axolotl/prompt_strategies/alpaca_instruct.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_strategies/alpaca_instruct.py b/src/axolotl/prompt_strategies/alpaca_instruct.py index 2e42191f8..143f070f2 100644 --- a/src/axolotl/prompt_strategies/alpaca_instruct.py +++ b/src/axolotl/prompt_strategies/alpaca_instruct.py @@ -1,7 +1,7 @@ """Module loading the AlpacaInstructPromptTokenizingStrategy class""" from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter def load(tokenizer, cfg): @@ -11,3 +11,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_no_prompt(tokenizer, cfg): + return AlpacaPromptTokenizingStrategy( + UnpromptedPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) From 612aabd8c468b6f1aeda80fdec5ec4a4bc3ae159 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 27 Jun 2023 15:40:25 -0400 Subject: [PATCH 42/55] push intermediate model checkpoints to hub --- src/axolotl/prompt_strategies/alpaca_chat.py | 11 ++++++++++- src/axolotl/utils/trainer.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 6161d7e37..952a55961 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -6,7 
+6,7 @@ from axolotl.prompt_tokenizers import ( AlpacaPromptTokenizingStrategy, InstructionPromptTokenizingStrategy, ) -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter def load(tokenizer, cfg): @@ -103,3 +103,12 @@ def load_camel_ai(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_no_prompt(tokenizer, cfg): + return AlpacaPromptTokenizingStrategy( + UnpromptedPrompter(PromptStyle.CHAT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 5cf3107f3..e9ec641a6 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -124,6 +124,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): if cfg.max_grad_norm: training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm + if cfg.push_to_hub_model_id: + training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id + training_arguments_kwargs["push_to_hub"] = True + training_args = transformers.TrainingArguments( per_device_train_batch_size=cfg.micro_batch_size, per_device_eval_batch_size=cfg.eval_batch_size From 924bbfddecfcd8b9ddfb5d0bad3b89d4a00edaac Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 28 Jun 2023 22:27:17 -0400 Subject: [PATCH 43/55] add option for instruct w sys prompts --- src/axolotl/prompt_strategies/alpaca_w_system.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..bcdcd9334 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -76,6 +76,19 @@ class SystemDataPrompter(AlpacaPrompter): def load(tokenizer, cfg): + return load_chat(tokenizer, cfg) + + +def load_instruct(tokenizer, cfg): + return InstructionWSystemPromptTokenizingStrategy( + SystemDataPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + + +def load_chat(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), tokenizer, From 530809fd7405f2abb1b88ab8d6d3cb78e5e765bb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 28 Jun 2023 22:36:28 -0400 Subject: [PATCH 44/55] update pip install command for apex --- docker/Dockerfile-base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 2728f3a72..20bd80f70 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -77,7 +77,7 @@ FROM base-builder RUN python3 -m pip uninstall -y apex RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners -RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 
+RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ RUN mkdir -p /workspace/builds COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes From 77bdb7d1444cd0fbd822a1a68fc2db6abbb78814 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 29 Jun 2023 14:29:55 +0900 Subject: [PATCH 45/55] Fix typing list --- src/axolotl/prompt_tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index cf80539eb..8216d73dd 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -440,7 +440,7 @@ def parse_tokenized_to_result( result: Dict[str, List[int]], current_len: int, res: Dict[str, List[int]], - labels: list[int], + labels: List[int], pad_token_id: Union[int, None] = None, ) -> Tuple[Dict[str, List[int]], int]: """ From c146880a7559d8f6b6553561cd11ad7d1745b6ae Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 30 Jun 2023 11:33:53 +0900 Subject: [PATCH 46/55] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 047d6aa34..27aec72db 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,8 @@ datasets: dataset_prepared_path: data/last_run_prepared # push prepared dataset to hub push_dataset_to_hub: # repo path +# push checkpoints to hub +push_to_hub_model_id: # repo path # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets # required to be true when used in combination with `push_dataset_to_hub` hf_use_auth_token: # boolean From 78a1e1fa12b7b4698328a21e15abbc0958e8babf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 1 Jul 2023 00:19:41 -0400 Subject: [PATCH 47/55] open orca support --- README.md | 4 ++++ .../prompt_strategies/alpaca_w_system.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/README.md b/README.md index 27aec72db..4929987cb 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"message_1": "...", "message_2": "..."} ``` +- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct + ```json + {"system_prompt": "...", "question": "...", "response": "..."} + ``` - `context_qa`: in context question answering from an article ```json {"article": "...", "question": "...", "answer": "..."} diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..1b4f50219 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter): yield res +class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): + """ + Tokenizing strategy for OpenOrca datasets + """ + + def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: + return ( + prompt["question"], + "", + prompt["response"], + prompt["system_prompt"], + ) + + def load(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), @@ -82,3 +96,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_open_orca(tokenizer, cfg): + return OpenOrcaPromptTokenizingStrategy( + 
SystemDataPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) From a10da1caff183cf986975a06f5c7ffc4f300fb22 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 1 Jul 2023 00:29:07 -0400 Subject: [PATCH 48/55] 11.7.0 nvidia/cuda docker images are deprecated, move to 11.7.1 --- .github/workflows/base.yml | 2 +- .github/workflows/main.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 623083db2..f3ad69570 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -26,7 +26,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: - cuda: "117" - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 033199154..07f25cac6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: gptq - cuda: cu117 - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: @@ -85,7 +85,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: gptq - cuda: cu117 - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: From 71456955f5da8015dacb138ec70b9693d33a037b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 2 Jul 2023 22:26:51 -0400 Subject: [PATCH 49/55] pin pydantic so deepspeed isn't broken --- docker/Dockerfile-base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 20bd80f70..adf7996ee 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install RUN git lfs install --skip-repo RUN pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic + pip3 install -U --no-cache-dir pydantic==1.10.10 From e79c8e617e1584a0fe4cac33c263237178b561ce Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 3 Jul 2023 12:44:29 +0900 Subject: [PATCH 50/55] Fix future deprecation push_to_hub_model_id --- README.md | 2 +- src/axolotl/utils/trainer.py | 4 ++-- src/axolotl/utils/validation.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4929987cb..e45ac54b7 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared # push prepared dataset to hub push_dataset_to_hub: # repo path # push checkpoints to hub -push_to_hub_model_id: # repo path +hub_model_id: # repo path # whether to use hf `use_auth_token` for loading datasets. 
Useful for fetching private datasets # required to be true when used in combination with `push_dataset_to_hub` hf_use_auth_token: # boolean diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index e9ec641a6..263d6c78d 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -124,8 +124,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): if cfg.max_grad_norm: training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm - if cfg.push_to_hub_model_id: - training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id + if cfg.hub_model_id: + training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id training_arguments_kwargs["push_to_hub"] = True training_args = transformers.TrainingArguments( diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 2e0da69b3..43b4b1d16 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -92,6 +92,11 @@ def validate_config(cfg): ): logging.warning("adamw hyperparameters found, but no adamw optimizer set") + if cfg.push_to_hub_model_id: + raise ValueError( + "push_to_hub_model_id is deprecated. Please use hub_model_id instead." + ) + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 From 9e64f42e0fe2f3a5075cf516c8ea0d95837e1ff5 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 6 Jul 2023 23:08:09 +0900 Subject: [PATCH 51/55] Fix local path loading and custom strategy type --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e45ac54b7..88e8b28ca 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended): #### How to add custom prompts 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. - 2. Use your custom file name as the dataset type. + 2. Use your custom file name as the dataset type `.load_`. Optionally, download some datasets, see [data/README.md](data/README.md) @@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic - dataset ```yaml + sequence_len: 2048 # max token length for prompt + + # huggingface repo datasets: - - path: vicgalle/alpaca-gpt4 # local or huggingface repo + - path: vicgalle/alpaca-gpt4 + type: alpaca # format from earlier + + # local + datasets: + - path: json + data_files: data.jsonl # or json type: alpaca # format from earlier - sequence_len: 2048 # max token length / prompt ``` - loading @@ -328,10 +336,10 @@ tf32: true # require >=ampere # a list of one or more datasets to finetune the model with datasets: - # this can be either a hf dataset, or relative path + # hf dataset repo | "json" for local dataset, make sure to fill data_files - path: vicgalle/alpaca-gpt4 # The type of prompt to use for training. 
[alpaca, sharegpt, gpteacher, oasst, reflection] - type: alpaca # format OR format:prompt_style (chat/instruct) + type: alpaca # format | format: (chat/instruct) | .load_ data_files: # path to source data files shards: # number of shards to split data into From 41da98b9823ee13234321be089d3d761c53b7529 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 6 Jul 2023 23:20:11 +0900 Subject: [PATCH 52/55] Fix for linter --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88e8b28ca..6b81e69de 100644 --- a/README.md +++ b/README.md @@ -256,8 +256,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic - dataset ```yaml sequence_len: 2048 # max token length for prompt - - # huggingface repo + + # huggingface repo datasets: - path: vicgalle/alpaca-gpt4 type: alpaca # format from earlier From 66afb76a15cb0f930baab850e77cc16d0cdfd029 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 7 Jul 2023 21:31:02 -0400 Subject: [PATCH 53/55] don't use llama if trust_remote_code is set since that needs to use AutoModel path --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 6d94cd674..95311ca2b 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -202,7 +202,7 @@ def load_model( else True, ) load_in_8bit = False - elif cfg.is_llama_derived_model: + elif cfg.is_llama_derived_model and not cfg.trust_remote_code: from transformers import LlamaForCausalLM config = LlamaConfig.from_pretrained(base_model_config) From d69da99c2c43c035c5ee7a425ad9c85aeef81dfb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 7 Jul 2023 21:33:11 -0400 Subject: [PATCH 54/55] skip explicit model type too if using trust_remote_code --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 95311ca2b..7181cca31 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -241,7 +241,7 @@ def load_model( # device=cfg.device, # ) # model.train() # sets to train instead of eval mode - elif model_type: + elif model_type and not cfg.trust_remote_code: model = getattr(transformers, model_type).from_pretrained( base_model, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, From 19cf0bda99b0957dd4ccd2152d27faa84f6f58a8 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 8 Jul 2023 12:13:39 -0400 Subject: [PATCH 55/55] params are adam_*, not adamw_* --- src/axolotl/utils/validation.py | 2 +- tests/test_validation.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 43b4b1d16..40dfb84a9 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -87,7 +87,7 @@ def validate_config(cfg): "You probably want to disable group_by_length as it will force a streamed dataset to download completely." 
) - if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and ( + if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and ( not cfg.optimizer or "adamw" not in cfg.optimizer ): logging.warning("adamw hyperparameters found, but no adamw optimizer set") diff --git a/tests/test_validation.py b/tests/test_validation.py index d39a4618e..88c97f0b7 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": None, - "adamw_epsilon": 0.0001, + "adam_epsilon": 0.0001, } ) @@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": "adafactor", - "adamw_beta1": 0.0001, + "adam_beta1": 0.0001, } ) @@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": "adamw_bnb_8bit", - "adamw_beta1": 0.0001, - "adamw_beta2": 0.0001, - "adamw_epsilon": 0.0001, + "adam_beta1": 0.9, + "adam_beta2": 0.99, + "adam_epsilon": 0.0001, } )