From 193c73bce040fe965f5ea66d235e8823bd19e5e7 Mon Sep 17 00:00:00 2001
From: Angainor Development <54739135+AngainorDev@users.noreply.github.com>
Date: Thu, 8 Jun 2023 09:18:58 +0200
Subject: [PATCH 01/35] Fix training over existing lora

When training with Lora, and starting with an existing lora weights, current code produces a model with 0 trainable params and training can't work.
Adding the "is_trainable" param allows the loaded peft to be trained and fixes the bug.
---
 src/axolotl/utils/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 58e0e97ec..b5d5124cb 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -402,6 +402,7 @@ def load_lora(model, cfg):
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
+            is_trainable=True,
             device_map=cfg.device_map,
             # torch_dtype=torch.float16,
         )

From 813cfa4c14f990c53ed42e9decd84b3e41a91102 Mon Sep 17 00:00:00 2001
From: Angainor Development <54739135+AngainorDev@users.noreply.github.com>
Date: Fri, 9 Jun 2023 08:49:32 +0200
Subject: [PATCH 02/35] WIP: Rely on cfg.inference

---
 src/axolotl/utils/models.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b5d5124cb..c3f988e52 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -80,8 +80,7 @@ def load_model(
     model_type,
     tokenizer,
     cfg,
-    adapter="lora",
-    inference=False,
+    adapter="lora"
 ):
     # type: (str, str, str, str, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
     """
@@ -95,7 +94,7 @@ def load_model(
     )
 
     if is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and inference is False:
+        if cfg.device not in ["mps", "cpu"] and cfg.inference is False:
             from axolotl.flash_attn import replace_llama_attn_with_flash_attn
 
             logging.info("patching with flash attention")
@@ -402,7 +401,7 @@ def load_lora(model, cfg):
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
-            is_trainable=True,
+            is_trainable=not cfg.inference,
             device_map=cfg.device_map,
             # torch_dtype=torch.float16,
         )

From bd3b53734459e0ace9795579150a3eff0ff4eaeb Mon Sep 17 00:00:00 2001
From: Angainor Development <54739135+AngainorDev@users.noreply.github.com>
Date: Fri, 9 Jun 2023 08:59:05 +0200
Subject: [PATCH 03/35] Feed cfg.inference

---
 scripts/finetune.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index 7c4d865fa..ab8f068aa 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -182,6 +182,9 @@ def train(
         if cfg.bf16:
             cfg.fp16 = True
         cfg.bf16 = False
+        
+    # Store inference mode into cfg when passed via args
+    cfg.inference = True if "inference" in kwargs else cfg.get("inference", False)
 
     # load the tokenizer first
     tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
@@ -189,8 +192,8 @@ def train(
     tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
 
     if check_not_in(
-        ["inference", "shard", "merge_lora"], kwargs
-    ):  # don't need to load dataset for these
+        ["shard", "merge_lora"], kwargs
+    ) and not cfg.inference:  # don't need to load dataset for these
         train_dataset, eval_dataset = load_prepare_datasets(
             tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
         )
@@ -216,8 +219,7 @@ def train(
         cfg.model_type,
         tokenizer,
         cfg,
-        adapter=cfg.adapter,
-        inference=("inference" in kwargs),
+        adapter=cfg.adapter
     )
 
     if "merge_lora" in kwargs and cfg.adapter is not None:
@@ -230,7 +232,7 @@ def train(
             model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
         return
 
-    if "inference" in kwargs:
+    if cfg.inference:
         logging.info("calling do_inference function")
         do_inference(cfg, model, tokenizer)
         return

From c2508987a6354084bf8bbf328c8eccc81e3a9814 Mon Sep 17 00:00:00 2001
From: Angainor Development <54739135+AngainorDev@users.noreply.github.com>
Date: Sat, 10 Jun 2023 19:06:10 +0200
Subject: [PATCH 04/35] Remove explicit definition of cfg.inference

---
 scripts/finetune.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index ab8f068aa..faf1bb31d 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -182,9 +182,6 @@ def train(
         if cfg.bf16:
             cfg.fp16 = True
         cfg.bf16 = False
-        
-    # Store inference mode into cfg when passed via args
-    cfg.inference = True if "inference" in kwargs else cfg.get("inference", False)
 
     # load the tokenizer first
     tokenizer_config = cfg.tokenizer_config or cfg.base_model_config

From 14163c15d9d52e6fd8674eeb865f689735e42b67 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sat, 10 Jun 2023 14:11:13 -0400
Subject: [PATCH 05/35] fix for local variable 'LlamaForCausalLM' referenced
 before assignment

---
 src/axolotl/utils/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 8ceaa0d53..4a3b4c55a 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -90,6 +90,7 @@ def load_model(
     Load a model from a base model and a model type.
     """
 
+    global LlamaForCausalLM  # pylint: disable=global-statement
     # TODO refactor as a kwarg
     load_in_8bit = cfg.load_in_8bit
     cfg.is_llama_derived_model = "llama" in base_model or (

From a808bf913f79fa29596e283a0bb70954caac0645 Mon Sep 17 00:00:00 2001
From: Angainor Development <54739135+AngainorDev@users.noreply.github.com>
Date: Sat, 10 Jun 2023 20:28:49 +0200
Subject: [PATCH 06/35] Fix missing cfg.

---
 src/axolotl/utils/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 7156adec0..67facd607 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -96,7 +96,7 @@ def load_model(
     )
 
     if cfg.is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and inference is False:
+        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
             from axolotl.flash_attn import replace_llama_attn_with_flash_attn
 
             logging.info("patching with flash attention")

From c4e4f8115c9deaf4f831afbfbeaf0d0fe2eac7b8 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sat, 10 Jun 2023 15:07:40 -0400
Subject: [PATCH 07/35] pass a prompt in from stdin for inference

---
 README.md           |  5 +++++
 scripts/finetune.py | 23 ++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index de929f237..180d97932 100644
--- a/README.md
+++ b/README.md
@@ -495,6 +495,11 @@ Pass the appropriate flag to the train command:
   ```bash
   --inference --base_model ./completed-model
   ```
+- Full weights finetune w/ a prompt from a text file:
+  ```bash
+  cat /tmp/prompt.txt | python scripts/finetune.py configs/your_config.yml \
+    --base_model ./completed-model --inference --prompter=None --load_in_8bit=True
+  ```
 
 ### Merge LORA to base
 
diff --git a/scripts/finetune.py b/scripts/finetune.py
index fa2dcf903..8a458890c 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -71,7 +71,11 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
         if not (cfg.special_tokens and token in cfg.special_tokens):
             tokenizer.add_special_tokens({token: symbol})
 
-    prompter_module = getattr(importlib.import_module("axolotl.prompters"), prompter)
+    prompter_module = None
+    if prompter:
+        prompter_module = getattr(
+            importlib.import_module("axolotl.prompters"), prompter
+        )
 
     while True:
         print("=" * 80)
@@ -79,9 +83,12 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
         instruction = get_multi_line_input()
         if not instruction:
             return
-        prompt: str = next(
-            prompter_module().build_prompt(instruction=instruction.strip("\n"))
-        )
+        if prompter_module:
+            prompt: str = next(
+                prompter_module().build_prompt(instruction=instruction.strip("\n"))
+            )
+        else:
+            prompt = instruction.strip()
         batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
         print("=" * 40)
         model.eval()
@@ -242,7 +249,13 @@ def train(
 
     if "inference" in kwargs:
         logging.info("calling do_inference function")
-        do_inference(cfg, model, tokenizer)
+        inf_kwargs: Dict[str, Any] = {}
+        if "prompter" in kwargs:
+            if kwargs["prompter"] == "None":
+                inf_kwargs["prompter"] = None
+            else:
+                inf_kwargs["prompter"] = kwargs["prompter"]
+        do_inference(cfg, model, tokenizer, **inf_kwargs)
         return
 
     if "shard" in kwargs:

From 5ffefee37f351c827759394057a295aa99b7d64f Mon Sep 17 00:00:00 2001
From: Akshay Jain <jainakshay1783@gmail.com>
Date: Sat, 10 Jun 2023 18:34:54 -0700
Subject: [PATCH 08/35] Update FAQS.md

Update FAQS.md with the following statement

Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c
/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit

try reinstalling bitsandbytes and transformers from source
---
 FAQS.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/FAQS.md b/FAQS.md
index bdf056be7..906cf2bef 100644
--- a/FAQS.md
+++ b/FAQS.md
@@ -2,3 +2,6 @@
 
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
+- Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c
+/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit. 
+<br>Try reinstalling bitsandbytes and transformers from source</br>

From 919727b4d7f5540db430135641d4b6ad2cbc729f Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sat, 10 Jun 2023 08:09:29 +0900
Subject: [PATCH 09/35] Refactor landmark attention patch

---
 .../monkeypatch/llama_landmark_attn.py        |  9 ++++++++
 src/axolotl/utils/models.py                   | 23 +++++++++----------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/src/axolotl/monkeypatch/llama_landmark_attn.py b/src/axolotl/monkeypatch/llama_landmark_attn.py
index 18e913f09..1a130f755 100644
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -1593,3 +1593,12 @@ def add_mem_tokens(example, mem_freq, mem_id):
     ret.extend(x[prev_idx:])
     # drop attention_mask
     return {"input_ids": ret}
+
+
+def patch_llama_with_landmark_attn():
+    import transformers
+
+    transformers.models.llama.modeling_llama.LlamaForCausalLM = LlamaForCausalLM
+    transformers.models.llama.modeling_llama.LlamaModel = LlamaModel
+    transformers.models.llama.modeling_llama.LlamaAttention = LlamaAttention
+    transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index fb363952c..b84597076 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -19,15 +19,6 @@ from transformers import (  # noqa: F401
     LlamaConfig,
 )
 
-try:
-    from transformers import (  # pylint: disable=unused-import  # noqa: F401
-        LlamaForCausalLM,
-    )
-except ImportError:
-    logging.warning(
-        "This version of transformers does not support Llama. Consider upgrading."
-    )
-
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
 
 if TYPE_CHECKING:
@@ -118,14 +109,15 @@ def load_model(
         logging.info("patching with sdp attention")
         hijack_llama_sdp_attention()
     elif cfg.is_llama_derived_model and cfg.landmark_attention:
-        from axolotl.monkeypatch.llama_landmark_attn import (  # pylint: disable=redefined-outer-name # noqa: F811
+        from axolotl.monkeypatch.llama_landmark_attn import (
             MEM_TOKEN,
-            LlamaForCausalLM,
+            patch_llama_with_landmark_attn,
         )
 
         logging.info("patching with landmark attention")
+        patch_llama_with_landmark_attn()
 
-        # TODO: Check if this would overwrite previous additional_special_tokens
+        # Note: This might overwrite previous additional_special_tokens
         tokenizer.add_special_tokens({"additional_special_tokens": [MEM_TOKEN]})
 
     if cfg.is_llama_derived_model and cfg.xpos_rope:
@@ -211,6 +203,13 @@ def load_model(
             )
             load_in_8bit = False
         elif cfg.is_llama_derived_model and "LlamaForCausalLM" in globals():
+            try:
+                from transformers import LlamaForCausalLM
+            except ImportError:
+                logging.warning(
+                    "This version of transformers does not support Llama. Consider upgrading."
+                )
+
             config = LlamaConfig.from_pretrained(base_model_config)
             model = LlamaForCausalLM.from_pretrained(
                 base_model,

From e285e24f7f4e1718c0f26c93eb7eec8e669ae97e Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sun, 11 Jun 2023 10:52:12 +0900
Subject: [PATCH 10/35] Address PR suggestion

Co-authored-by: Wing Lian <wing.lian@gmail.com>
---
 src/axolotl/utils/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b84597076..b3a5eeb60 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -202,7 +202,7 @@ def load_model(
                 else True,
             )
             load_in_8bit = False
-        elif cfg.is_llama_derived_model and "LlamaForCausalLM" in globals():
+        elif cfg.is_llama_derived_model:
             try:
                 from transformers import LlamaForCausalLM
             except ImportError:

From dd7d16d2ebee1ef21dc8061e8f271ae1d4329858 Mon Sep 17 00:00:00 2001
From: Akshay Jain <jainakshay1783@gmail.com>
Date: Sat, 10 Jun 2023 19:15:50 -0700
Subject: [PATCH 11/35] Update FAQS.md

Updated FAQS.md with backticks around error message
---
 FAQS.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/FAQS.md b/FAQS.md
index 906cf2bef..28183abc9 100644
--- a/FAQS.md
+++ b/FAQS.md
@@ -2,6 +2,6 @@
 
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
-- Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c
-/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.  This could lead to a segmentation fault at exit. 
-<br>Try reinstalling bitsandbytes and transformers from source</br>
+- ```Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c```
+```/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.```  
+This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.

From 0e664a5ebcb6c09a24cadbc567be5eae5e7bdb42 Mon Sep 17 00:00:00 2001
From: Akshay Jain <jainakshay1783@gmail.com>
Date: Sat, 10 Jun 2023 19:26:12 -0700
Subject: [PATCH 12/35] Update FAQS.md

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
---
 FAQS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FAQS.md b/FAQS.md
index 28183abc9..29222992a 100644
--- a/FAQS.md
+++ b/FAQS.md
@@ -3,5 +3,5 @@
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
 - ```Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c```
-```/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.```  
+`/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
 This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.

From cd0a6f60277063aa2b928dcd6a47e2a73a2a6610 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sat, 10 Jun 2023 22:50:09 -0400
Subject: [PATCH 13/35] peft no longer needs device_map

---
 src/axolotl/utils/models.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index fb363952c..995f8e8f1 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -375,7 +375,6 @@ def load_llama_adapter(model, cfg):
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
-            device_map=cfg.device_map,
             torch_dtype=torch.float16,
         )
     else:
@@ -437,8 +436,6 @@ def load_lora(model, cfg):
         model = PeftModel.from_pretrained(
             model,
             cfg.lora_model_dir,
-            device_map=cfg.device_map,
-            # torch_dtype=torch.float16,
         )
     else:
         model = get_peft_model(model, lora_config)

From 563b6d89e6ee7fc104204ce7cd178b56c35b633d Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sun, 11 Jun 2023 11:58:31 +0900
Subject: [PATCH 14/35] Fix undefined LlamaForCausalLM and del try except

---
 src/axolotl/utils/models.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index b3a5eeb60..43d4a6a9c 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,6 @@ def load_model(
     Load a model from a base model and a model type.
     """
 
-    global LlamaForCausalLM  # pylint: disable=global-statement
     # TODO refactor as a kwarg
     load_in_8bit = cfg.load_in_8bit
     cfg.is_llama_derived_model = "llama" in base_model or (
@@ -203,12 +202,7 @@ def load_model(
             )
             load_in_8bit = False
         elif cfg.is_llama_derived_model:
-            try:
-                from transformers import LlamaForCausalLM
-            except ImportError:
-                logging.warning(
-                    "This version of transformers does not support Llama. Consider upgrading."
-                )
+            from transformers import LlamaForCausalLM
 
             config = LlamaConfig.from_pretrained(base_model_config)
             model = LlamaForCausalLM.from_pretrained(

From a6190c8094ead6d31570d597b6f2dfdbe83f50ff Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sun, 11 Jun 2023 11:59:03 +0900
Subject: [PATCH 15/35] Clean up landmark patching

---
 .../monkeypatch/llama_landmark_attn.py        | 439 ++----------------
 1 file changed, 37 insertions(+), 402 deletions(-)

diff --git a/src/axolotl/monkeypatch/llama_landmark_attn.py b/src/axolotl/monkeypatch/llama_landmark_attn.py
index 1a130f755..51f1b90fe 100644
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -28,15 +28,23 @@ from typing import List, Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers.activations import ACT2FN
+from torch.nn import CrossEntropyLoss
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
-    SequenceClassifierOutputWithPast,
 )
-from transformers.modeling_utils import PreTrainedModel
 from transformers.models.llama.configuration_llama import LlamaConfig
+from transformers.models.llama.modeling_llama import (
+    LLAMA_INPUTS_DOCSTRING,
+    LLAMA_START_DOCSTRING,
+    LlamaMLP,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    _expand_mask,
+    _make_causal_mask,
+    rotate_half,
+)
 from transformers.utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
@@ -51,131 +59,6 @@ _CONFIG_FOR_DOC = "LlamaConfig"
 MEM_TOKEN = "<landmark>"  # nosec
 
 
-# Copied from transformers.models.bart.modeling_bart._make_causal_mask
-def _make_causal_mask(
-    input_ids_shape: torch.Size,
-    dtype: torch.dtype,
-    device: torch.device,
-    past_key_values_length: int = 0,
-):
-    """
-    Make causal mask used for bi-directional self-attention.
-    """
-    bsz, tgt_len = input_ids_shape
-    mask = torch.full(
-        (tgt_len, tgt_len),
-        torch.tensor(torch.finfo(dtype).min, device=device),
-        device=device,
-    )
-    mask_cond = torch.arange(mask.size(-1), device=device)
-    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
-    mask = mask.to(dtype)
-
-    if past_key_values_length > 0:
-        mask = torch.cat(
-            [
-                torch.zeros(
-                    tgt_len, past_key_values_length, dtype=dtype, device=device
-                ),
-                mask,
-            ],
-            dim=-1,
-        )
-    return mask[None, None, :, :].expand(
-        bsz, 1, tgt_len, tgt_len + past_key_values_length
-    )
-
-
-# Copied from transformers.models.bart.modeling_bart._expand_mask
-def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
-    """
-    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
-    """
-    bsz, src_len = mask.size()
-    tgt_len = tgt_len if tgt_len is not None else src_len
-
-    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
-
-    inverted_mask = 1.0 - expanded_mask
-
-    return inverted_mask.masked_fill(
-        inverted_mask.to(torch.bool), torch.finfo(dtype).min
-    )
-
-
-class LlamaRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        LlamaRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-
-        # convert into half-precision if necessary
-        if self.weight.dtype in [torch.float16, torch.bfloat16]:
-            hidden_states = hidden_states.to(self.weight.dtype)
-
-        return self.weight * hidden_states
-
-
-class LlamaRotaryEmbedding(torch.nn.Module):
-    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
-        super().__init__()
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim))
-        self.register_buffer("inv_freq", inv_freq)
-
-        # Build here to make `torch.jit.trace` work.
-        self.max_seq_len_cached = max_position_embeddings
-        t = torch.arange(
-            self.max_seq_len_cached,
-            device=self.inv_freq.device,
-            dtype=self.inv_freq.dtype,
-        )
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer(
-            "cos_cached", emb.cos()[None, None, :, :], persistent=False
-        )
-        self.register_buffer(
-            "sin_cached", emb.sin()[None, None, :, :], persistent=False
-        )
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case.
-        if seq_len > self.max_seq_len_cached:
-            self.max_seq_len_cached = seq_len
-            t = torch.arange(
-                self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype
-            )
-            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
-            # Different from paper, but it uses a different permutation in order to obtain the same calculation
-            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            self.register_buffer(
-                "cos_cached", emb.cos()[None, None, :, :], persistent=False
-            )
-            self.register_buffer(
-                "sin_cached", emb.sin()[None, None, :, :], persistent=False
-            )
-        return (
-            self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-            self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype),
-        )
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.
     cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
@@ -190,24 +73,11 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
     return q_embed, k_embed
 
 
-class LlamaMLP(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        intermediate_size: int,
-        hidden_act: str,
-    ):
-        super().__init__()
-        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
-        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.act_fn = ACT2FN[hidden_act]
-
-    def forward(self, x):
-        return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-
-
 class LandmarkGroupedSoftmaxFunction(torch.autograd.Function):
+    """
+    Landmark grouped softmax function.
+    """
+
     # Note that forward, setup_context, and backward are @staticmethods
     @staticmethod
     def forward(ctx, x, dim, mem_cnt, resp_mem_idx):
@@ -682,16 +552,14 @@ class LlamaAttention(nn.Module):
         # upcast attention to fp32
         if is_mem is None:
             raise ValueError("Don't use this without landmarks")
-            # attn_weights = nn.functional.softmax(
-            #     attn_weights, dim=-1, dtype=torch.float32
-            # ).to(query_states.dtype)
-        else:
-            attn_weights = landmark_grouped_softmax(
-                attn_weights,
-                dim=-1,
-                is_mem=is_mem.expand(-1, self.num_heads, -1, -1),
-                last_section_mask=last_section_mask,
-            ).to(query_states.dtype)
+
+        attn_weights = landmark_grouped_softmax(
+            attn_weights,
+            dim=-1,
+            is_mem=is_mem.expand(-1, self.num_heads, -1, -1),
+            last_section_mask=last_section_mask,
+        ).to(query_states.dtype)
+
         if attn_prefix is not None:
             attn_prefix, attn_weights = torch.split(
                 attn_weights,
@@ -722,6 +590,10 @@ class LlamaAttention(nn.Module):
 
 
 class LlamaDecoderLayer(nn.Module):
+    """
+    Llama Decoder layer
+    """
+
     def __init__(self, config: LlamaConfig):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -802,114 +674,6 @@ class LlamaDecoderLayer(nn.Module):
         return outputs
 
 
-LLAMA_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-
-    Parameters:
-        config ([`LlamaConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-
-
-@add_start_docstrings(
-    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
-    LLAMA_START_DOCSTRING,
-)
-class LlamaPreTrainedModel(PreTrainedModel):
-    config_class = LlamaConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer"]
-    _keys_to_ignore_on_load_unexpected = [r"decoder\.version"]
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, LlamaModel):
-            module.gradient_checkpointing = value
-
-
-LLAMA_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-
-            [What are attention masks?](../glossary#attention-mask)
-
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-
-            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
-
-            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
-            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
-            information on the default strategy.
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
-            config.n_positions - 1]`.
-
-            [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
-            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
-            model's internal embedding lookup matrix.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-
-
 @add_start_docstrings(
     "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
     LLAMA_START_DOCSTRING,
@@ -1178,6 +942,10 @@ class LlamaModel(LlamaPreTrainedModel):
 
 
 class LlamaForCausalLM(LlamaPreTrainedModel):
+    """
+    Llama model with a causal language modeling head.
+    """
+
     def __init__(self, config):
         super().__init__(config)
         self.model = LlamaModel(config)
@@ -1448,149 +1216,15 @@ class LlamaForCausalLM(LlamaPreTrainedModel):
         return reordered_past
 
 
-@add_start_docstrings(
-    """
-    The LLaMa Model transformer with a sequence classification head on top (linear layer).
-
-    [`LlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
-    (e.g. GPT-2) do.
-
-    Since it does classification on the last token, it requires to know the position of the last token. If a
-    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
-    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
-    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
-    each row of the batch).
-    """,
-    LLAMA_START_DOCSTRING,
-)
-class LlamaForSequenceClassification(LlamaPreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
-
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.model = LlamaModel(config)
-        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.model.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.model.embed_tokens = value
-
-    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
-            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        transformer_outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = transformer_outputs[0]
-        logits = self.score(hidden_states)
-
-        if input_ids is not None:
-            batch_size = input_ids.shape[0]
-        else:
-            batch_size = inputs_embeds.shape[0]
-
-        if self.config.pad_token_id is None and batch_size != 1:
-            raise ValueError(
-                "Cannot handle batch sizes > 1 if no padding token is defined."
-            )
-        if self.config.pad_token_id is None:
-            sequence_lengths = -1
-        else:
-            if input_ids is not None:
-                sequence_lengths = (
-                    torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
-                ).to(logits.device)
-            else:
-                sequence_lengths = -1
-
-        pooled_logits = logits[
-            torch.arange(batch_size, device=logits.device), sequence_lengths
-        ]
-
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (
-                    labels.dtype == torch.long or labels.dtype == torch.int
-                ):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(pooled_logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(
-                    pooled_logits.view(-1, self.num_labels), labels.view(-1)
-                )
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(pooled_logits, labels)
-        if not return_dict:
-            output = (pooled_logits,) + transformer_outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-
-        return SequenceClassifierOutputWithPast(
-            loss=loss,
-            logits=pooled_logits,
-            past_key_values=transformer_outputs.past_key_values,
-            hidden_states=transformer_outputs.hidden_states,
-            attentions=transformer_outputs.attentions,
-        )
-
-
 def add_mem_tokens(example, mem_freq, mem_id):
-    x = example["input_ids"]
+    ids = example["input_ids"]
     ret = []
     prev_idx = 0
-    for t_idx in range(mem_freq, len(x), mem_freq):
-        ret.extend(x[prev_idx:t_idx])
+    for t_idx in range(mem_freq, len(ids), mem_freq):
+        ret.extend(ids[prev_idx:t_idx])
         ret.append(mem_id)
         prev_idx = t_idx
-    ret.extend(x[prev_idx:])
+    ret.extend(ids[prev_idx:])
     # drop attention_mask
     return {"input_ids": ret}
 
@@ -1602,3 +1236,4 @@ def patch_llama_with_landmark_attn():
     transformers.models.llama.modeling_llama.LlamaModel = LlamaModel
     transformers.models.llama.modeling_llama.LlamaAttention = LlamaAttention
     transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
+    transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb

From 572d1141e67789dfdb78dca35b6be6945d4fb782 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sun, 11 Jun 2023 12:05:37 +0900
Subject: [PATCH 16/35] Set mem cache args on inference

---
 scripts/finetune.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index 8a458890c..cdc4b5e0e 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -77,6 +77,11 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
             importlib.import_module("axolotl.prompters"), prompter
         )
 
+    if cfg.landmark_attention:
+        model.set_mem_cache_args(
+            max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
+        )
+
     while True:
         print("=" * 80)
         # support for multiline inputs
@@ -90,6 +95,7 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
         else:
             prompt = instruction.strip()
         batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
+
         print("=" * 40)
         model.eval()
         with torch.no_grad():

From 974dc00a7d966e1b26c7e69aea378c8d325776c8 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Sun, 11 Jun 2023 14:00:54 +0900
Subject: [PATCH 17/35] Fix set mem_id for inference and refactor

---
 scripts/finetune.py                            |  3 +++
 src/axolotl/monkeypatch/llama_landmark_attn.py | 10 ++++++++++
 src/axolotl/utils/trainer.py                   | 11 +++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index cdc4b5e0e..4875256ba 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -78,6 +78,9 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
         )
 
     if cfg.landmark_attention:
+        from axolotl.monkeypatch.llama_landmark_attn import set_model_mem_id
+
+        set_model_mem_id(model, tokenizer)
         model.set_mem_cache_args(
             max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
         )
diff --git a/src/axolotl/monkeypatch/llama_landmark_attn.py b/src/axolotl/monkeypatch/llama_landmark_attn.py
index 51f1b90fe..2a4cdbc36 100644
--- a/src/axolotl/monkeypatch/llama_landmark_attn.py
+++ b/src/axolotl/monkeypatch/llama_landmark_attn.py
@@ -29,6 +29,7 @@ import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss
+from transformers import LlamaTokenizer
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
@@ -1237,3 +1238,12 @@ def patch_llama_with_landmark_attn():
     transformers.models.llama.modeling_llama.LlamaAttention = LlamaAttention
     transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
     transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb
+
+
+def set_model_mem_id(model: LlamaForCausalLM, tokenizer: LlamaTokenizer):
+    mem_id = tokenizer.convert_tokens_to_ids(MEM_TOKEN)
+    model.set_mem_id(mem_id)
+
+
+def get_mem_id(tokenizer: LlamaTokenizer):
+    return tokenizer.convert_tokens_to_ids(MEM_TOKEN)
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 9ae1e7e93..1250ad4f6 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -239,16 +239,19 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     if cfg.is_llama_derived_model and cfg.landmark_attention:
         from functools import partial
 
-        from axolotl.monkeypatch.llama_landmark_attn import MEM_TOKEN, add_mem_tokens
+        from axolotl.monkeypatch.llama_landmark_attn import (
+            add_mem_tokens,
+            get_mem_id,
+            set_model_mem_id,
+        )
 
-        mem_id = tokenizer.convert_tokens_to_ids(MEM_TOKEN)
-        model.set_mem_id(mem_id)
+        set_model_mem_id(model, tokenizer)
 
         logging.info("Adding landmark attention tokens to dataset")
 
         for dataset in [train_dataset, eval_dataset]:
             dataset = dataset.map(
-                partial(add_mem_tokens, mem_freq=50, mem_id=mem_id),
+                partial(add_mem_tokens, mem_freq=50, mem_id=get_mem_id(tokenizer)),
                 batched=False,
                 num_proc=32,
             )

From e3e7b52a5b0549fc6abb75ab6d3aa377d9478924 Mon Sep 17 00:00:00 2001
From: Akshay Jain <jainakshay1783@gmail.com>
Date: Sat, 10 Jun 2023 23:36:14 -0700
Subject: [PATCH 18/35] Update FAQS.md

Converted (```) to single backtick (') uniformly.
---
 FAQS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/FAQS.md b/FAQS.md
index 29222992a..f3c9dd525 100644
--- a/FAQS.md
+++ b/FAQS.md
@@ -2,6 +2,6 @@
 
 - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
 - Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
-- ```Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c```
+- `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
 `/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
 This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.

From fe0b76854ec444643481da131228c8d214654f91 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 09:20:40 -0400
Subject: [PATCH 19/35] match up gradient checkpointing when using lora w
 config

---
 src/axolotl/utils/models.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index fb363952c..b79f116fa 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -305,7 +305,9 @@ def load_model(
         or (cfg.adapter == "qlora" and cfg.load_in_4bit)
     ):
         logging.info("converting PEFT model w/ prepare_model_for_kbit_training")
-        model = prepare_model_for_kbit_training(model)
+        model = prepare_model_for_kbit_training(
+            model, use_gradient_checkpointing=cfg.gradient_checkpointing
+        )
 
     model, lora_config = load_adapter(model, cfg, adapter)
 

From b565ecf0a1d6bcecbcfa7366dc2ca04983ca0523 Mon Sep 17 00:00:00 2001
From: AngainorDev <dev@angainor.com>
Date: Sun, 11 Jun 2023 15:23:38 +0200
Subject: [PATCH 20/35] Fix strict and Lint

---
 scripts/finetune.py         | 10 +++++-----
 src/axolotl/utils/models.py |  9 ++-------
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/scripts/finetune.py b/scripts/finetune.py
index 3222afd81..49bd505ce 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -158,7 +158,7 @@ def train(
     cfg_keys = cfg.keys()
     for k, _ in kwargs.items():
         # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or cfg.strict is False:
+        if k in cfg_keys or not cfg.strict:
             # handle booleans
             if isinstance(cfg[k], bool):
                 cfg[k] = bool(kwargs[k])
@@ -198,9 +198,9 @@ def train(
     logging.info(f"loading tokenizer... {tokenizer_config}")
     tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
 
-    if check_not_in(
-        ["shard", "merge_lora"], kwargs
-    ) and not cfg.inference:  # don't need to load dataset for these
+    if (
+        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
+    ):  # don't need to load dataset for these
         train_dataset, eval_dataset = load_prepare_datasets(
             tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
         )
@@ -226,7 +226,7 @@ def train(
         cfg.model_type,
         tokenizer,
         cfg,
-        adapter=cfg.adapter
+        adapter=cfg.adapter,
     )
 
     if "merge_lora" in kwargs and cfg.adapter is not None:
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 67facd607..3a87392fc 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -77,14 +77,9 @@ def load_tokenizer(
 
 
 def load_model(
-    base_model,
-    base_model_config,
-    model_type,
-    tokenizer,
-    cfg,
-    adapter="lora"
+    base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora"
 ):
-    # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+    # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
     """
     Load a model from a base model and a model type.
     """

From 14668fa54ec8c35771d50ff7956cbb6541e81f6a Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 09:26:10 -0400
Subject: [PATCH 21/35] new validation for mpt w grad checkpoints

---
 src/axolotl/utils/validation.py |  5 +++++
 tests/test_validation.py        | 14 ++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py
index 04ffc4c1b..e2d0b34b1 100644
--- a/src/axolotl/utils/validation.py
+++ b/src/axolotl/utils/validation.py
@@ -57,6 +57,11 @@ def validate_config(cfg):
     if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
         raise ValueError("FSDP is not supported for falcon models")
 
+    if (
+        cfg.base_model and "mpt" in cfg.base_model.lower()
+    ) and cfg.gradient_checkpointing:
+        raise ValueError("gradient_checkpointing is not supported for MPT models")
+
     # TODO
     # MPT 7b
     # https://github.com/facebookresearch/bitsandbytes/issues/25
diff --git a/tests/test_validation.py b/tests/test_validation.py
index 50bdf37e6..e28891060 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -198,3 +198,17 @@ class ValidationTest(unittest.TestCase):
         )
 
         validate_config(cfg)
+
+    def test_mpt_gradient_checkpointing(self):
+        regex_exp = r".*gradient_checkpointing is not supported for MPT models*"
+
+        # Check for lower-case
+        cfg = DictDefault(
+            {
+                "base_model": "mosaicml/mpt-7b",
+                "gradient_checkpointing": True,
+            }
+        )
+
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)

From 77762a5d6b5ee73d5acbbb2e94508c23183f7391 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 09:41:41 -0400
Subject: [PATCH 22/35] get rid of some configs, formalize pythioa lora config

---
 configs/accelerate/default_config.yaml        | 15 -------
 configs/llama_13B_alpaca.yml                  | 39 ----------------
 configs/llama_65B_alpaca.yml                  | 44 ------------------
 configs/llama_7B_4bit.yml                     | 45 -------------------
 configs/quickstart.yml                        | 45 -------------------
 configs/vicuna_13B_4bit_reflect.yml           | 45 -------------------
 .../pythia/lora.yml                           | 26 +++++------
 7 files changed, 11 insertions(+), 248 deletions(-)
 delete mode 100644 configs/accelerate/default_config.yaml
 delete mode 100644 configs/llama_13B_alpaca.yml
 delete mode 100644 configs/llama_65B_alpaca.yml
 delete mode 100644 configs/llama_7B_4bit.yml
 delete mode 100644 configs/quickstart.yml
 delete mode 100644 configs/vicuna_13B_4bit_reflect.yml
 rename configs/pythia_1_2B_alpaca.yml => examples/pythia/lora.yml (56%)

diff --git a/configs/accelerate/default_config.yaml b/configs/accelerate/default_config.yaml
deleted file mode 100644
index 9759703af..000000000
--- a/configs/accelerate/default_config.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-compute_environment: LOCAL_MACHINE
-distributed_type: 'NO'
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/configs/llama_13B_alpaca.yml b/configs/llama_13B_alpaca.yml
deleted file mode 100644
index 99c9883fe..000000000
--- a/configs/llama_13B_alpaca.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-base_model: huggyllama/llama-13b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-    type: sharegpt
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.002
-adapter:
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./llama-13b-sharegpt
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-warmup_steps: 1000
-save_steps:
-eval_steps:
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience: 5
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
deleted file mode 100644
index e7d2c211c..000000000
--- a/configs/llama_65B_alpaca.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-base_model: huggyllama/llama-65b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
-    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-65b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-warmup_steps: 1000
-save_steps:
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/llama_7B_4bit.yml b/configs/llama_7B_4bit.yml
deleted file mode 100644
index a7451516c..000000000
--- a/configs/llama_7B_4bit.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-base_model: decapoda-research/llama-7b-hf-int4
-base_model_config: decapoda-research/llama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: tatsu-lab/alpaca  # original alpaca dataset
-    type: alpaca
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-test
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-auto_resume_from_checkpoints: true
-local_rank:
-load_4bit: true
-xformers_attention: true
-flash_attention:
diff --git a/configs/quickstart.yml b/configs/quickstart.yml
deleted file mode 100644
index 2362916fc..000000000
--- a/configs/quickstart.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-base_model: decapoda-research/llama-7b-hf-int4
-base_model_config: decapoda-research/llama-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: tatsu-lab/alpaca  # original alpaca dataset
-    type: alpaca
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 1024
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-test
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-auto_resume_from_checkpoints: true
-local_rank:
-gptq: true
-xformers_attention: true
-flash_attention:
diff --git a/configs/vicuna_13B_4bit_reflect.yml b/configs/vicuna_13B_4bit_reflect.yml
deleted file mode 100644
index 3e37f5334..000000000
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ /dev/null
@@ -1,45 +0,0 @@
-base_model: anon8231489123/vicuna-13b-GPTQ-4bit-128g
-base_model_config: anon8231489123/vicuna-13b-GPTQ-4bit-128g
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: false
-load_4bit: true
-gptq_groupsize: 128
-gptq_model_v1: false
-datasets:
-# https://github.com/vaguenebula/AlpacaDataReflect/blob/main/alpaca_reflect_pruned.json
-  - path: data/alpaca_reflect_pruned.jsonl
-    type: reflection
-dataset_prepared_path: data/last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-reflect
-gradient_accumulation_steps: 1
-micro_batch_size: 2
-num_epochs: 3
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-gradient_checkpointing: false
-early_stopping_patience: 3
-resume_from_checkpoint:
-local_rank:
-flash_attention: true
diff --git a/configs/pythia_1_2B_alpaca.yml b/examples/pythia/lora.yml
similarity index 56%
rename from configs/pythia_1_2B_alpaca.yml
rename to examples/pythia/lora.yml
index 52ed58cb5..e2b28f218 100644
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/examples/pythia/lora.yml
@@ -1,36 +1,29 @@
 base_model: EleutherAI/pythia-1.4b-deduped
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
+base_model_config: EleutherAI/pythia-1.4b-deduped
 load_in_8bit: true
 datasets:
-  - path: data/alpaca_data_gpt4.jsonl
+  - path: teknium/GPT4-LLM-Cleaned
     type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
-sequence_len: 2048
-lora_r: 8
+sequence_len: 512
+lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
   - query_key_value
-#  - xxx
+lora_target_linear:
 lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: pythia-1.4b-lora
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./lora-alpaca
+output_dir: ./lora-alpaca-pythia
 gradient_accumulation_steps: 1
 micro_batch_size: 4
-num_epochs: 5
+num_epochs: 3
 learning_rate: 0.00001
 train_on_inputs: false
 group_by_length: false
@@ -39,3 +32,6 @@ tf32: True
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
+weight_decay: 0.1
+eval_steps: 20
+logging_steps: 1

From c530e4b9c877815b8f23a730014a10b81e206cdf Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 10:09:05 -0400
Subject: [PATCH 23/35] more config pruning and migrating

---
 configs/llama_7B_alpaca.yml                   | 41 ---------
 configs/sample.yml                            | 87 -------------------
 examples/gptj-qlora/config.yml                | 57 ++++++++++++
 .../jeopardy-bot/config.yml                   | 25 +++---
 4 files changed, 68 insertions(+), 142 deletions(-)
 delete mode 100644 configs/llama_7B_alpaca.yml
 delete mode 100644 configs/sample.yml
 create mode 100644 examples/gptj-qlora/config.yml
 rename configs/llama_7B_jeopardy.yml => examples/jeopardy-bot/config.yml (75%)

diff --git a/configs/llama_7B_alpaca.yml b/configs/llama_7B_alpaca.yml
deleted file mode 100644
index 7db2f65aa..000000000
--- a/configs/llama_7B_alpaca.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-base_model: huggyllama/llama-7b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-7b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/sample.yml b/configs/sample.yml
deleted file mode 100644
index ddd95cb55..000000000
--- a/configs/sample.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# this can also be a relative path to a model on disk
-base_model: decapoda-research/llama-7b-hf-int4
-# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# if the base_model repo on hf hub doesn't include configuration .json files,
-# you can set that here, or leave this empty to default to base_model
-base_model_config: decapoda-research/llama-7b-hf
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# whether you are training a 4-bit quantized model
-load_4bit: true
-# this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# a list of one or more datasets to finetune the model with
-datasets:
-  # this can be either a hf dataset, or relative path
-  - path: vicgalle/alpaca-gpt4
-  # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca
-# axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
-val_set_size: 0.04
-# if you want to use lora, leave blank to train all parameters in original model
-adapter: lora
-# if you already have a lora model trained that you want to load, put that here
-lora_model_dir:
-# the maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# max sequence length to concatenate training samples together up to
-# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-max_packed_sequence_len: 1024
-# lora hyperparameters
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-# wandb configuration if your're using it
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-# where to save the finsihed model to
-output_dir: ./completed-model
-# training hyperparameters
-gradient_accumulation_steps: 1
-batch_size:
-micro_batch_size: 2
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-# whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
-group_by_length: false
-# Use CUDA bf16
-bf16: true
-# Use CUDA tf32
-tf32: true
-# does not work with current implementation of 4-bit LoRA
-gradient_checkpointing: false
-# stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-# specify a scheduler to use with the optimizer. only one_cycle is supported currently
-lr_scheduler:
-# whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
-flash_attention:
-# resume from a specific checkpoint dir
-resume_from_checkpoint:
-# if resume_from_checkpoint isn't set and you simply want it to start where it left off
-# be careful with this being turned on between different models
-auto_resume_from_checkpoints: false
-# don't mess with this, it's here for accelerate and torchrun
-local_rank:
diff --git a/examples/gptj-qlora/config.yml b/examples/gptj-qlora/config.yml
new file mode 100644
index 000000000..858c14862
--- /dev/null
+++ b/examples/gptj-qlora/config.yml
@@ -0,0 +1,57 @@
+base_model: EleutherAI/gpt-j-6b
+base_model_config: EleutherAI/gpt-j-6b
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len:
+lora_r: 8
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+gradient_accumulation_steps: 2
+micro_batch_size: 2
+num_epochs: 2
+optimizer: paged_adamw_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0001
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"
diff --git a/configs/llama_7B_jeopardy.yml b/examples/jeopardy-bot/config.yml
similarity index 75%
rename from configs/llama_7B_jeopardy.yml
rename to examples/jeopardy-bot/config.yml
index 287d6d6ab..b803c6074 100644
--- a/configs/llama_7B_jeopardy.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -7,30 +7,28 @@ datasets:
   - path: openaccess-ai-collective/jeopardy
     type: jeopardy
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
+val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
+sequence_len: 512
+max_packed_sequence_len:
+lora_r:
+lora_alpha:
+lora_dropout:
 lora_target_modules:
-  - q_proj
-  - v_proj
 lora_fan_in_fan_out: false
-wandb_project: jeopardy-bot-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 2
+num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.0000002
+learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
 save_steps: 660
 debug:
 deepspeed:
-weight_decay: 0.0001
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 tokens:
-  pad_token: "[PAD]"
   bos_token: "<s>"
   eos_token: "</s>"
   unk_token: "<unk>"

From effbbf6dd13b564dcbafbbf155557bb43f76359a Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 10:38:24 -0400
Subject: [PATCH 24/35] more pruning

---
 configs/cerebras_1_3B_alpaca.yml | 40 ---------------------
 configs/galactica_1_3B.yml       | 41 ----------------------
 configs/gpt_neox_20b.yml         | 39 ---------------------
 configs/stability_3b.yml         | 56 -----------------------------
 examples/cerebras/qlora.yml      | 60 ++++++++++++++++++++++++++++++++
 5 files changed, 60 insertions(+), 176 deletions(-)
 delete mode 100644 configs/cerebras_1_3B_alpaca.yml
 delete mode 100644 configs/galactica_1_3B.yml
 delete mode 100644 configs/gpt_neox_20b.yml
 delete mode 100644 configs/stability_3b.yml
 create mode 100644 examples/cerebras/qlora.yml

diff --git a/configs/cerebras_1_3B_alpaca.yml b/configs/cerebras_1_3B_alpaca.yml
deleted file mode 100644
index 958bf4c5a..000000000
--- a/configs/cerebras_1_3B_alpaca.yml
+++ /dev/null
@@ -1,40 +0,0 @@
-base_model: cerebras/Cerebras-GPT-1.3B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - c_attn
-lora_fan_in_fan_out: false
-wandb_project: pythia-1.4b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.0003
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-gradient_checkpointing:
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/galactica_1_3B.yml b/configs/galactica_1_3B.yml
deleted file mode 100644
index 2abb4c6b4..000000000
--- a/configs/galactica_1_3B.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-base_model: facebook/galactica-1.3b
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: false
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.1
-adapter:
-lora_model_dir:
-sequence_len: 1024
-max_packed_sequence_len: 1024
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 3
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: false
-tf32: false
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-tokens:
-  pad_token: "[PAD]"
-  bos_token: "<s>"
-  eos_token: "</s>"
-  unk_token: "<unk>"
diff --git a/configs/gpt_neox_20b.yml b/configs/gpt_neox_20b.yml
deleted file mode 100644
index 730afb72c..000000000
--- a/configs/gpt_neox_20b.yml
+++ /dev/null
@@ -1,39 +0,0 @@
-base_model: EleutherAI/gpt-neox-20b
-base_model_ignore_patterns: pytorch*  # prefer safetensors
-model_type: GPTNeoXForCausalLM
-tokenizer_type: AutoTokenizer
-load_in_8bit: true
-datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
-    type: alpaca
-    shards: 4
-    shards_index: 0
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - query_key_value
-lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
-wandb_project: gpt4all-neox-20b
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./gpt4all-neox-20b
-gradient_accumulation_steps: 1
-micro_batch_size: 4
-num_epochs: 5
-learning_rate: 0.00003
-lr_scheduler: one_cycle
-train_on_inputs: false
-group_by_length: false
-bf16: True
-tf32: True
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/stability_3b.yml b/configs/stability_3b.yml
deleted file mode 100644
index 83516a20a..000000000
--- a/configs/stability_3b.yml
+++ /dev/null
@@ -1,56 +0,0 @@
-base_model: stabilityai/stablelm-base-alpha-3b
-base_model_config: stabilityai/stablelm-base-alpha-3b
-load_in_8bit: false
-datasets:
-  - path: vicgalle/alpaca-gpt4
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter:
-lora_model_dir:
-sequence_len: 4096
-max_packed_sequence_len: 4096
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: stable-alpaca-3b
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./stable-alpaca-3b
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-torchdistx_path:
-lr_scheduler: cosine
-learning_rate: 0.0000002
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention: true
-flash_attention:
-gptq_groupsize:
-gptq_model_v1:
-warmup_steps: 100
-eval_steps: 50
-save_steps: 200
-debug:
-deepspeed:
-weight_decay: 0.01
-fsdp:
-fsdp_config:
-#tokens:
-#  pad_token: "[PAD]"
-#  bos_token: "<s>"
-#  eos_token: "</s>"
-#  unk_token: "<unk>"
diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
new file mode 100644
index 000000000..9340299b9
--- /dev/null
+++ b/examples/cerebras/qlora.yml
@@ -0,0 +1,60 @@
+base_model: cerebras/Cerebras-GPT-1.3B
+base_model_config: cerebras/Cerebras-GPT-1.3B
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len: 2048
+lora_r: 16
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+  - c_fc
+  - c_attn
+  - c_proj
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+batch_size: 4
+micro_batch_size: 4
+num_epochs: 2
+optimizer: paged_adamw_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"

From a43bae9ff06d927083f60e76caaae9338d43b613 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 10:44:03 -0400
Subject: [PATCH 25/35] update the support matrix

---
 README.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 180d97932..1c1e1f65a 100644
--- a/README.md
+++ b/README.md
@@ -16,13 +16,14 @@
 
 ## Axolotl supports
 
-|         | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
-|---------|:----------|:------------------|------|------------|------------------------------|-----------------|--------------------|
-| llama   | ✅         | ✅                 | ✅  | ✅          | ✅                            | ✅               | ✅                  |
-| Pythia  | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| cerebras | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| mpt     | ✅         | ❌                 | ❓  | ❌          | ❌                            | ❌               | ❓                  |
-| falcon  | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❌               | ❓                  |
+|          | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
+|----------|:----------|:------------------|------|------------|------------------------------|----------------|-----------------|
+| llama    | ✅         | ✅                 | ✅  | ✅          | ✅                            | ✅              | ✅               |
+| Pythia   | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌              | ❓               |
+| cerebras | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌              | ✅                |
+| mpt      | ✅         | ❌                 | ❓  | ❌          | ❌                            | ❌              | ❓               |
+| falcon   | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❌              | ✅                |
+| gpt-j    | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❓               | ✅                |
 
 
 ## Quickstart ⚡

From 280832cec2e8c32aecfa525a161b6932ee4078f9 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 10:52:36 -0400
Subject: [PATCH 26/35] more matrix updates

---
 README.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 1c1e1f65a..0edeff447 100644
--- a/README.md
+++ b/README.md
@@ -17,13 +17,13 @@
 ## Axolotl supports
 
 |          | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
-|----------|:----------|:------------------|------|------------|------------------------------|----------------|-----------------|
-| llama    | ✅         | ✅                 | ✅  | ✅          | ✅                            | ✅              | ✅               |
-| Pythia   | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌              | ❓               |
-| cerebras | ✅         | ✅                 | ❓  | ❌          | ❌                            | ❌              | ✅                |
-| mpt      | ✅         | ❌                 | ❓  | ❌          | ❌                            | ❌              | ❓               |
-| falcon   | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❌              | ✅                |
-| gpt-j    | ✅         | ✅                 | ✅  | ❌          | ❌                            | ❓               | ✅                |
+|----------|:----------|:------------------|-------|------------|------------------------------|-----------------|--------------------|
+| llama    | ✅         | ✅                 | ✅     | ✅          | ✅                            | ✅               | ✅                  |
+| Pythia   | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ❓                  |
+| cerebras | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ✅                  |
+| mpt      | ✅         | ❌                 | ❓     | ❌          | ❌                            | ❌               | ❓                  |
+| falcon   | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ✅                  |
+| gpt-j    | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❓               | ✅                  |
 
 
 ## Quickstart ⚡

From a6ebf57e827ff1d9c41238bf606ce7c3f7338f98 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 10:55:32 -0400
Subject: [PATCH 27/35] fix table formatting

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 0edeff447..8d201e739 100644
--- a/README.md
+++ b/README.md
@@ -16,14 +16,14 @@
 
 ## Axolotl supports
 
-|          | fp16/fp32 | fp16/fp32 w/ lora | qlora | 4bit-quant | 4bit-quant w/flash attention | flash attention | xformers attention |
-|----------|:----------|:------------------|-------|------------|------------------------------|-----------------|--------------------|
-| llama    | ✅         | ✅                 | ✅     | ✅          | ✅                            | ✅               | ✅                  |
-| Pythia   | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ❓                  |
-| cerebras | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ✅                  |
-| mpt      | ✅         | ❌                 | ❓     | ❌          | ❌                            | ❌               | ❓                  |
-| falcon   | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❌               | ✅                  |
-| gpt-j    | ✅         | ✅                 | ✅     | ❌          | ❌                            | ❓               | ✅                  |
+|          | fp16/fp32 | fp16/fp32 w/ lora | qlora | gptq | gptq w/ lora | gptq w/flash attention | flash attention | xformers attention |
+|----------|:----------|:------------------|-------|------|:-------------|------------------------|-----------------|--------------------|
+| llama    | ✅         | ✅                 | ✅     | ✅    | ❓            | ✅                      | ✅               | ✅                  |
+| Pythia   | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ❓                  |
+| cerebras | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ✅                  |
+| mpt      | ✅         | ❌                 | ❓     | ❌    | ❓            | ❌                      | ❌               | ❓                  |
+| falcon   | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ✅                  |
+| gpt-j    | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❓               | ✅                  |
 
 
 ## Quickstart ⚡

From d0d7eaa4f347c9fc6ba267d1966b83ea6f048a96 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 11:03:31 -0400
Subject: [PATCH 28/35] update openllama and clean up paths

---
 README.md                                        | 16 ++++++++--------
 .../{gptj-qlora/config.yml => gptj/qlora.yml}    |  0
 examples/openllama-3b/README.md                  | 16 ++++++++++++++++
 .../config.yml => openllama-3b/lora.yml}         |  4 ++--
 .../config.yml => openllama-3b/qlora.yml}        |  4 ++--
 examples/qlora-openllama-3b/README.md            |  6 ------
 6 files changed, 28 insertions(+), 18 deletions(-)
 rename examples/{gptj-qlora/config.yml => gptj/qlora.yml} (100%)
 create mode 100644 examples/openllama-3b/README.md
 rename examples/{lora-openllama-3b/config.yml => openllama-3b/lora.yml} (90%)
 rename examples/{qlora-openllama-3b/config.yml => openllama-3b/qlora.yml} (90%)
 delete mode 100644 examples/qlora-openllama-3b/README.md

diff --git a/README.md b/README.md
index 8d201e739..a31eee5fb 100644
--- a/README.md
+++ b/README.md
@@ -16,14 +16,14 @@
 
 ## Axolotl supports
 
-|          | fp16/fp32 | fp16/fp32 w/ lora | qlora | gptq | gptq w/ lora | gptq w/flash attention | flash attention | xformers attention |
-|----------|:----------|:------------------|-------|------|:-------------|------------------------|-----------------|--------------------|
-| llama    | ✅         | ✅                 | ✅     | ✅    | ❓            | ✅                      | ✅               | ✅                  |
-| Pythia   | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ❓                  |
-| cerebras | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ✅                  |
-| mpt      | ✅         | ❌                 | ❓     | ❌    | ❓            | ❌                      | ❌               | ❓                  |
-| falcon   | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❌               | ✅                  |
-| gpt-j    | ✅         | ✅                 | ✅     | ❌    | ❓            | ❌                      | ❓               | ✅                  |
+|          | fp16/fp32 | lora | qlora | gptq | gptq w/ lora | gptq w/flash attn | flash attn | xformers attn |
+|----------|:----------|:-----|-------|------|:-------------|-------------------|------------|---------------|
+| llama    | ✅         | ✅    | ✅     | ✅    | ❓            | ✅                 | ✅          | ✅             |
+| Pythia   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ❓             |
+| cerebras | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
+| mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |
+| falcon   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
+| gpt-j    | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❓          | ✅             |
 
 
 ## Quickstart ⚡
diff --git a/examples/gptj-qlora/config.yml b/examples/gptj/qlora.yml
similarity index 100%
rename from examples/gptj-qlora/config.yml
rename to examples/gptj/qlora.yml
diff --git a/examples/openllama-3b/README.md b/examples/openllama-3b/README.md
new file mode 100644
index 000000000..9e8f3a9e8
--- /dev/null
+++ b/examples/openllama-3b/README.md
@@ -0,0 +1,16 @@
+# openllama-3b
+
+Basic full tune
+```shell
+accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
+```
+
+LoRA
+```shell
+accelerate launch scripts/finetune.py examples/qlora-openllama-3b/lora.yml
+```
+
+QLoRA
+```shell
+accelerate launch scripts/finetune.py examples/qlora-openllama-3b/qlora.yml
+```
diff --git a/examples/lora-openllama-3b/config.yml b/examples/openllama-3b/lora.yml
similarity index 90%
rename from examples/lora-openllama-3b/config.yml
rename to examples/openllama-3b/lora.yml
index 2e1644546..98e2c2adc 100644
--- a/examples/lora-openllama-3b/config.yml
+++ b/examples/openllama-3b/lora.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
diff --git a/examples/qlora-openllama-3b/config.yml b/examples/openllama-3b/qlora.yml
similarity index 90%
rename from examples/qlora-openllama-3b/config.yml
rename to examples/openllama-3b/qlora.yml
index 87e1dfd94..83ae31f91 100644
--- a/examples/qlora-openllama-3b/config.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b_600bt_preview
-base_model_config: openlm-research/open_llama_3b_600bt_preview
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
diff --git a/examples/qlora-openllama-3b/README.md b/examples/qlora-openllama-3b/README.md
deleted file mode 100644
index d79ea7f3f..000000000
--- a/examples/qlora-openllama-3b/README.md
+++ /dev/null
@@ -1,6 +0,0 @@
-# qlora-openllama-3b
-
-```shell
-accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
-
-```

From 336aa3fd487a8c35b7637fbbddac81a67d078a41 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 11:05:29 -0400
Subject: [PATCH 29/35] gptq lora llama is obviously good

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a31eee5fb..349dd370a 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 
 |          | fp16/fp32 | lora | qlora | gptq | gptq w/ lora | gptq w/flash attn | flash attn | xformers attn |
 |----------|:----------|:-----|-------|------|:-------------|-------------------|------------|---------------|
-| llama    | ✅         | ✅    | ✅     | ✅    | ❓            | ✅                 | ✅          | ✅             |
+| llama    | ✅         | ✅    | ✅     | ✅    | ✅             | ✅                 | ✅          | ✅             |
 | Pythia   | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ❓             |
 | cerebras | ✅         | ✅    | ✅     | ❌    | ❓            | ❌                 | ❌          | ✅             |
 | mpt      | ✅         | ❌    | ❓     | ❌    | ❓            | ❌                 | ❌          | ❓             |

From 6b3f509d9e14e58369a7c4322de78a46a90924ae Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 11:50:12 -0400
Subject: [PATCH 30/35] forgot to add this file

---
 examples/openllama-3b/config.yml | 61 ++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)
 create mode 100644 examples/openllama-3b/config.yml

diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
new file mode 100644
index 000000000..248b740ff
--- /dev/null
+++ b/examples/openllama-3b/config.yml
@@ -0,0 +1,61 @@
+base_model: openlm-research/open_llama_3b
+base_model_config: openlm-research/open_llama_3b
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.02
+adapter:
+lora_model_dir:
+sequence_len: 256
+max_packed_sequence_len:
+lora_r:
+lora_alpha:
+lora_dropout:
+lora_target_modules:
+lora_target_linear:
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./lora-out
+batch_size: 16
+micro_batch_size: 4
+num_epochs: 3
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: false
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention:
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"

From 4cd1deeef2203c628025439356df463d3174569c Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Mon, 12 Jun 2023 02:44:46 +0900
Subject: [PATCH 31/35] Add save_steps and eval_steps to Readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 349dd370a..8ba63299e 100644
--- a/README.md
+++ b/README.md
@@ -382,6 +382,8 @@ num_epochs: 3
 warmup_steps: 100
 learning_rate: 0.00003
 logging_steps:
+save_steps:
+eval_steps:
 
 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false

From aac4b7691eb81b868d840a8d9ad0725dc0e70071 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 18:46:26 -0400
Subject: [PATCH 32/35] add new sharegpt, refactor prompt so it can be
 customized later, add exception if no data is processed

---
 README.md                                     | 10 +++++-
 src/axolotl/datasets.py                       |  4 +++
 .../prompt_strategies/sharegpt_jokes.py       | 28 ++++++++++++++++
 .../prompt_strategies/sharegpt_simple.py      | 21 ++++++++++++
 src/axolotl/prompters.py                      | 33 +++++++++++--------
 src/axolotl/utils/data.py                     | 11 +++++--
 6 files changed, 90 insertions(+), 17 deletions(-)
 create mode 100644 src/axolotl/prompt_strategies/sharegpt_jokes.py

diff --git a/README.md b/README.md
index 8ba63299e..10586afaf 100644
--- a/README.md
+++ b/README.md
@@ -219,6 +219,14 @@ Have dataset(s) in one of the following format (JSONL recommended):
   ```json
   {"conversations": [{"role": "...", "value": "..."}]}
   ```
+- `sharegpt_simple.load_role`: conversations where `role` is used instead of `from`
+  ```json
+  {"conversations": [{"role": "...", "value": "..."}]}
+  ```
+- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny
+  ```json
+  {"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
+  ```
 
 </details>
 
@@ -530,7 +538,7 @@ Try set `fp16: true`
 
 Try to turn off xformers.
 
-## Need help? 🙋‍♂️
+## Need help? 🙋♂️
 
 Join our [Discord server](https://discord.gg/HhrNrHJPRb) where we can help you
 
diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py
index d6367ce7c..40c58bc9c 100644
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -33,12 +33,16 @@ class TokenizedPromptDataset(IterableDataset):
 
     def __iter__(self):
         iterator = iter(self.dataset)
+        count = 0
         # Loop through the entire dataset
         for example in iterator:
             try:
                 yield self.prompt_tokenizer.tokenize_prompt(example)
+                count += 1
             except InvalidDataException:
                 pass
+        if count == 0:
+            raise RuntimeError("Expected at least one datapoint in dataset.")
 
 
 # TODO this isn't the best since it can't interleave datasets
diff --git a/src/axolotl/prompt_strategies/sharegpt_jokes.py b/src/axolotl/prompt_strategies/sharegpt_jokes.py
new file mode 100644
index 000000000..ac424bf7c
--- /dev/null
+++ b/src/axolotl/prompt_strategies/sharegpt_jokes.py
@@ -0,0 +1,28 @@
+"""Module for Jokes prompts using sharegpt style """
+from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
+from axolotl.prompters import PromptStyle, ShareGPTPrompter
+
+
+def load(tokenizer, cfg):
+    return SimpleJokesShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
+class SimpleJokesShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    Tokenization strategy for asking bot to tell a joke and then explain why its funny
+    """
+
+    # title, text, explanation
+    def get_conversation_thread(self, prompt):
+        title = "" if not prompt["title"] else prompt["title"] + " "
+        return [
+            {"from": "human", "value": "Tell me a joke."},
+            {"from": "gpt", "value": title + prompt["text"]},
+            {"from": "human", "value": "Why is that joke funny?"},
+            {"from": "gpt", "value": prompt["explanation"]},
+        ]
diff --git a/src/axolotl/prompt_strategies/sharegpt_simple.py b/src/axolotl/prompt_strategies/sharegpt_simple.py
index 4346663f2..bfe0d164b 100644
--- a/src/axolotl/prompt_strategies/sharegpt_simple.py
+++ b/src/axolotl/prompt_strategies/sharegpt_simple.py
@@ -13,6 +13,15 @@ def load(tokenizer, cfg):
     )
 
 
+def load_role(tokenizer, cfg):
+    return SimpleRoleShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompter(PromptStyle.CHAT.value),
+        tokenizer,
+        cfg.train_on_inputs,
+        cfg.sequence_len,
+    )
+
+
 def load_guanaco(tokenizer, cfg):
     return GuanacoShareGPTPromptTokenizingStrategy(
         ShareGPTPrompter(PromptStyle.CHAT.value),
@@ -31,6 +40,18 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
         return prompt["conversations"]
 
 
+class SimpleRoleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
+    """
+    basic sharegpt strategy to grab conversations from the sample row, but uses role instead of from
+    """
+
+    def get_conversation_thread(self, prompt):
+        conversations = prompt["conversations"]
+        # remap role: prompter/assistant, text: ... => from: human/gpt, value: ...
+        turns = [{"from": t["role"], "value": t["value"]} for t in conversations]
+        return turns
+
+
 class GuanacoShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
     """
     sharegpt strategy that remaps oasst data to sharegpt format
diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index 39c74023b..6ec8ad0e0 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -261,28 +261,33 @@ class Conversation:
         self.messages.append([role, message])
 
 
-conv_vicuna_v1_1 = Conversation(
-    system="A chat between a curious user and an artificial intelligence assistant. "
-    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
-    roles=["USER", "ASSISTANT"],
-    messages=[],
-    offset=0,
-    sep_style=SeparatorStyle.TWO,
-    sep=" ",
-    sep2=" ",
-)
-
-
 class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
     """
     A prompter that generates prompts for the ShareGPT
     """
 
-    def __init__(self, prompt_style=None):
+    def __init__(self, prompt_style=None, system_prompt=None):
         if prompt_style != PromptStyle.CHAT.value:
             raise ValueError(
                 f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
             )
+        system = (
+            system_prompt
+            if system_prompt
+            else (
+                "A chat between a curious user and an artificial intelligence assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions."
+            )
+        )
+        self._conversation = Conversation(
+            system=system,
+            roles=["USER", "ASSISTANT"],
+            messages=[],
+            offset=0,
+            sep_style=SeparatorStyle.TWO,
+            sep=" ",
+            sep2=" ",
+        )
 
     # def match_prompt_style(self):
     #     if self.prompt_style == PromptStyle.chat.value:
@@ -300,7 +305,7 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
             # also happens on the data splitting leaving empty conversations
             raise IndexError
 
-        conv = conv_vicuna_v1_1.copy()
+        conv = self._conversation.copy()
         roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 
         try:
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index cba964076..9fee2fb9b 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -239,8 +239,15 @@ def load_tokenized_prepared_datasets(
                 ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
                 datasets.append(ds_wrapper)
             else:
-                logging.error(f"unhandled prompt tokenization strategy: {d.type}")
-                raise ValueError(f"unhandled prompt tokenization strategy: {d.type}")
+                suffix = ""
+                if ":load_" in d.type:
+                    suffix = f" Did you mean {d.type.replace(':load_', '.load_')}?"
+                logging.error(
+                    f"unhandled prompt tokenization strategy: {d.type}. {suffix}"
+                )
+                raise ValueError(
+                    f"unhandled prompt tokenization strategy: {d.type} {suffix}"
+                )
         logging.info("tokenizing, merging, and shuffling master dataset")
 
         samples: List[int] = []

From c7dee56b8772830c243c30bd9c062659223eb712 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Sun, 11 Jun 2023 19:52:34 -0400
Subject: [PATCH 33/35] add typehints

---
 src/axolotl/prompters.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index 6ec8ad0e0..29cc4446b 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -266,12 +266,12 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
     A prompter that generates prompts for the ShareGPT
     """
 
-    def __init__(self, prompt_style=None, system_prompt=None):
+    def __init__(self, prompt_style=None, system_prompt: Optional[str] = None):
         if prompt_style != PromptStyle.CHAT.value:
             raise ValueError(
                 f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
             )
-        system = (
+        system: str = (
             system_prompt
             if system_prompt
             else (

From 9a58e99e812f1e4074da02ff56529b4986563931 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Mon, 12 Jun 2023 01:52:58 -0400
Subject: [PATCH 34/35] config fixes

---
 examples/falcon/config-7b-lora.yml | 2 +-
 examples/falcon/config-7b.yml      | 2 +-
 examples/openllama-3b/config.yml   | 4 ++--
 examples/openllama-3b/lora.yml     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/falcon/config-7b-lora.yml b/examples/falcon/config-7b-lora.yml
index 090cc6bcf..8aa585851 100644
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
diff --git a/examples/falcon/config-7b.yml b/examples/falcon/config-7b.yml
index dc67d6125..b267566ce 100644
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -23,7 +23,7 @@ lora_dropout: 0.0
 lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-wandb_project: falcon-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
index 248b740ff..6fd704ffc 100644
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -25,7 +25,7 @@ wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
-output_dir: ./lora-out
+output_dir: ./openllama-out
 batch_size: 16
 micro_batch_size: 4
 num_epochs: 3
@@ -43,7 +43,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention:
+xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml
index 98e2c2adc..d1f252455 100644
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -49,7 +49,7 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention:
+xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:

From 52cde69288d5616ba8e7fd7a262b1247523eba38 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Mon, 12 Jun 2023 17:06:15 +0900
Subject: [PATCH 35/35] Fix config path after config moved

---
 README.md                       | 4 ++--
 examples/gptq-lora-7b/README.md | 2 +-
 examples/openllama-3b/README.md | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index 8ba63299e..7ebea8678 100644
--- a/README.md
+++ b/README.md
@@ -39,10 +39,10 @@ pip3 install -U git+https://github.com/huggingface/peft.git
 accelerate config
 
 # finetune lora
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
 
 # inference
-accelerate launch scripts/finetune.py examples/lora-openllama-3b/config.yml \
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
     --inference --lora_model_dir="./lora-out"
 ```
 
diff --git a/examples/gptq-lora-7b/README.md b/examples/gptq-lora-7b/README.md
index eefe98d3f..0bde51b06 100644
--- a/examples/gptq-lora-7b/README.md
+++ b/examples/gptq-lora-7b/README.md
@@ -3,6 +3,6 @@
 This is a good place to start for beginners. This will run on an NVIDIA RTX4090 with no other changes needed.
 
 ```shell
-accelerate launch scripts/finetune.py examples/4bit-lora-7b/config.yml
+accelerate launch scripts/finetune.py examples/gptq-lora-7b/config.yml
 
 ```
diff --git a/examples/openllama-3b/README.md b/examples/openllama-3b/README.md
index 9e8f3a9e8..3e9501a54 100644
--- a/examples/openllama-3b/README.md
+++ b/examples/openllama-3b/README.md
@@ -2,15 +2,15 @@
 
 Basic full tune
 ```shell
-accelerate launch scripts/finetune.py examples/qlora-openllama-3b/config.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/config.yml
 ```
 
 LoRA
 ```shell
-accelerate launch scripts/finetune.py examples/qlora-openllama-3b/lora.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
 ```
 
 QLoRA
 ```shell
-accelerate launch scripts/finetune.py examples/qlora-openllama-3b/qlora.yml
+accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml
 ```