From 1d7da3b389424bcd0b3274daf96bacb35c89334c Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 09:58:29 -0400
Subject: [PATCH 01/21] add missing file

---
 src/axolotl/utils/validation.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 src/axolotl/utils/validation.py
diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py
new file mode 100644
index 000000000..9bef37406
--- /dev/null
+++ b/src/axolotl/utils/validation.py
@@ -0,0 +1,10 @@
+def validate_config(cfg):  # sanity-check cfg; only the "qlora" adapter is validated so far
+    if cfg.adapter == "qlora":  # NOTE(review): asserts are stripped under `python -O`
+        assert cfg.load_in_8bit is False  # identity check: must be exactly False (None fails too)
+        assert cfg.load_4bit is False  # same strict check for the load_4bit flag
+        assert cfg.load_in_4bit is True  # must be exactly True when adapter is qlora
+    pass  # no-op; the function implicitly returns None
+    # TODO: checks not implemented yet, notes for future validation:
+    # MPT 7b
+    # https://github.com/facebookresearch/bitsandbytes/issues/25
+    # no 8bit adamw with bf16

From 85522184917cd701863147aebcc5619efbf9bf06 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:04:44 +0900
Subject: [PATCH 02/21] Improve Inference instruction

---
 README.md | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 969708d47..531270dcc 100644
--- a/README.md
+++ b/README.md
@@ -317,12 +317,16 @@ accelerate launch scripts/finetune.py configs/your_config.yml
 
 ### Inference
 
-Add `--inference` flag to train command above
+Pass the appropriate flag to the train command:
 
-If you are inferencing a pretrained LORA, pass 
-```bash
---lora_model_dir ./completed-model
-```
+- Pretrained LORA:
+  ```bash
+  --inference --lora_model_dir ./completed-model
+  ```
+- Full weights finetune:
+  ```bash
+  --inference --base_model ./completed-model
+  ```
 
 ### Merge LORA to base
 

From 9083910036c0d7daf56e8f7a1f8309694f6704ff Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:26:39 +0900
Subject: [PATCH 03/21] Update lora config

---
 README.md | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 531270dcc..c51d7101c 100644
--- a/README.md
+++ b/README.md
@@ -134,7 +134,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
 
 - lora
   ```yaml
-  adapter: lora # blank for full finetune
+  adapter: lora # qlora or leave blank for full finetune
   lora_r: 8
   lora_alpha: 16
   lora_dropout: 0.05
@@ -185,6 +185,8 @@ datasets:
   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
     type: alpaca
     data_files: # path to source data files
+    shards: # true if use subset data. make sure to set `shards` param also
+shards: # number of shards to split dataset into
 
 # axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
@@ -201,7 +203,7 @@ sequence_len: 2048
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 max_packed_sequence_len: 1024
 
-# if you want to use lora, leave blank to train all parameters in original model
+# set to 'lora' or 'qlora' to train an adapter, or leave blank to train all parameters in the original model
 adapter: lora
 # if you already have a lora model trained that you want to load, put that here
 # lora hyperparameters
@@ -293,6 +295,9 @@ torchdistx_path:
 
 # Debug mode
 debug:
+
+# Seed
+seed:
 ```
 
 </details>

From 5b712afbe4e30ddabe4b9d1b219a40c1c331c44f Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:28:03 +0900
Subject: [PATCH 04/21] Update bf16 options

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c51d7101c..f02ae20a0 100644
--- a/README.md
+++ b/README.md
@@ -172,7 +172,7 @@ gptq_model_v1: false # v1 or v2
 load_in_8bit: true
 
 # Use CUDA bf16
-bf16: true
+bf16: true # bool or 'full' for `bf16_full_eval`
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32

From 05c18340d69fd546c881f8e2968e5a8775e22a95 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:32:03 +0900
Subject: [PATCH 05/21] Update scheduler configs

---
 README.md | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f02ae20a0..1beceda7a 100644
--- a/README.md
+++ b/README.md
@@ -254,8 +254,18 @@ gradient_checkpointing: false
 # stop training after this many evaluation losses have increased in a row
 # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
 early_stopping_patience: 3
-# specify a scheduler to use with the optimizer. only one_cycle is supported currently
-lr_scheduler:
+
+# specify a scheduler and kwargs to use with the optimizer
+lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+lr_scheduler_kwargs:
+
+# for one_cycle optim
+lr_div_factor: # learning rate div factor
+
+# for log_sweep optim
+log_sweep_min_lr:
+log_sweep_max_lr:
+
 # specify optimizer
 optimizer:
 # specify weight decay

From 29273b5a5b42f5efa9290d8c3987697e8b7375f7 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:34:43 +0900
Subject: [PATCH 06/21] Add other minor configs

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1beceda7a..407e03ed3 100644
--- a/README.md
+++ b/README.md
@@ -226,6 +226,7 @@ lora_out_dir:
 lora_fan_in_fan_out: false
 
 # wandb configuration if you're using it
+wandb_mode:
 wandb_project:
 wandb_watch:
 wandb_run_id:
@@ -300,9 +301,12 @@ fsdp_config:
 # Deepspeed
 deepspeed:
 
-# TODO
+# Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
 
+# Set padding for data collator to 'longest'
+collator_pad_to_longest:
+
 # Debug mode
 debug:
 

From 7bc28eb8a8bf8786001c80afc991a8d4d4145b9c Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:43:37 +0900
Subject: [PATCH 07/21] Add more data formats

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 407e03ed3..1bab0722f 100644
--- a/README.md
+++ b/README.md
@@ -97,6 +97,17 @@ Have dataset(s) in one of the following format (JSONL recommended):
   ```json
   {"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
   ```
+- `explainchoice`: question, choices, (solution OR explanation)
+  ```json
+  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+  ```
+- `concisechoice`: question, choices, (solution OR explanation)
+  ```json
+  {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+- `summarizetldr`: article and summary
+  ```json
+  {"article": "...", "summary": "..."}
+  ```
 
 > Have some new format to propose? Check if it's already defined in [data.py](src/axolotl/utils/data.py) in `dev` branch!
 

From 2c34f8d0c769341ce6c0b4eb3fb4291f77fcf581 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:44:58 +0900
Subject: [PATCH 08/21] Update dataset type

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 1bab0722f..1f90be3d6 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ datasets:
   # this can be either a hf dataset, or relative path
   - path: vicgalle/alpaca-gpt4
   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca
+    type: alpaca # format OR format:prompt_style (chat/instruct)
     data_files: # path to source data files
     shards: # true if use subset data. make sure to set `shards` param also
 shards: # number of shards to split dataset into

From 1377400c333d92769b0165eb1b3da850c9a71ad8 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 22:33:45 +0900
Subject: [PATCH 09/21] Add info on Runtime Error

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 1f90be3d6..e039fa639 100644
--- a/README.md
+++ b/README.md
@@ -375,6 +375,10 @@ Please reduce any below
   - `eval_batch_size`
   - `sequence_len`
 
+> RuntimeError: expected scalar type Float but found Half
+
+Try setting `fp16: true`
+
 ## Contributing 🤝
 
 Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

From e65c203e9e7b799c526c43d5ba87d94d26a09e14 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 22:45:58 +0900
Subject: [PATCH 10/21] Add more detail on minimum GPU

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index e039fa639..5a95eb474 100644
--- a/README.md
+++ b/README.md
@@ -135,11 +135,11 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
 
 - loading
   ```yaml
-  load_4bit: true
   load_in_8bit: true
-  bf16: true
+  load_in_8bit: true
+  bf16: true # require >=ampere
   fp16: true
-  tf32: true
+  tf32: true # require >=ampere
   ```
   Note: Repo does not do 4-bit quantization.
 
@@ -183,11 +183,11 @@ gptq_model_v1: false # v1 or v2
 load_in_8bit: true
 
 # Use CUDA bf16
-bf16: true # bool or 'full' for `bf16_full_eval`
+bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
 # Use CUDA fp16
 fp16: true
 # Use CUDA tf32
-tf32: true
+tf32: true # require >=ampere
 
 # a list of one or more datasets to finetune the model with
 datasets:
@@ -286,7 +286,7 @@ weight_decay:
 # whether to use xformers attention patch https://github.com/facebookresearch/xformers:
 xformers_attention:
 # whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
-flash_attention:
+flash_attention:  # require a100 for llama
 
 # resume from a specific checkpoint dir
 resume_from_checkpoint:

From f92245dbd65141ba1d0a5f9c2cf12504107da789 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 23:04:33 +0900
Subject: [PATCH 11/21] Fix missing closing code block

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5a95eb474..110fec95d 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 - `concisechoice`: question, choices, (solution OR explanation)
   ```json
   {"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
+  ```
 - `summarizetldr`: article and summary
   ```json
   {"article": "...", "summary": "..."}

From 52fb6d8a34203229c07e3301cdbaac9026b65861 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 17:48:49 +0900
Subject: [PATCH 12/21] Update gitignore using standard Python template

---
 .gitignore | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 161 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index b7a09516c..93a4f81b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,163 @@
 **/axolotl.egg-info
-**/__pycache__
-.idea
 configs
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
\ No newline at end of file

From a9e502ef45f29773986eda92eabb601477b12eb2 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 25 May 2023 23:48:18 +0900
Subject: [PATCH 13/21] Update 4bit notes

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 110fec95d..79e583c71 100644
--- a/README.md
+++ b/README.md
@@ -136,7 +136,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
 
 - loading
   ```yaml
-  load_in_8bit: true
+  load_in_4bit: true
   load_in_8bit: true
   bf16: true # require >=ampere
   fp16: true
@@ -175,13 +175,15 @@ tokenizer_type: AutoTokenizer
 # Trust remote code for untrusted source
 trust_remote_code:
 
-# whether you are training a 4-bit quantized model
+# whether you are training a 4-bit GPTQ quantized model
 load_4bit: true
 gptq_groupsize: 128 # group size
 gptq_model_v1: false # v1 or v2
 
 # this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
 load_in_8bit: true
+# use bitsandbytes 4 bit
+load_in_4bit:
 
 # Use CUDA bf16
 bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere

From f5fa3d131b33c1fedf963a0f6e8c4fa0aae8a10f Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 11:29:15 -0400
Subject: [PATCH 14/21] fix cd within flash-attn

---
 docker/Dockerfile-base | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 943bae3b0..5d84a7c24 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -43,11 +43,11 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \
     python3 setup.py bdist_wheel && \
     cd csrc/fused_dense_lib && \
     python3 setup.py bdist_wheel && \
-    cd csrc/xentropy && \
+    cd ../csrc/xentropy && \
     python3 setup.py bdist_wheel && \
-    cd csrc/rotary && \
+    cd ../csrc/rotary && \
     python3 setup.py bdist_wheel && \
-    cd csrc/layer_norm && \
+    cd ../csrc/layer_norm && \
     python3 setup.py bdist_wheel
 
 FROM base-builder AS deepspeed-builder

From a5d739b66b5fc7123d085a99a2b9e0a9dd5df92f Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 11:59:08 -0400
Subject: [PATCH 15/21] fixes w/ example for super basic lora starter

---
 examples/lora-alpaca-7b/config.yml | 67 ++++++++++++++++++++++++++++++
 src/axolotl/prompters.py           |  2 +-
 src/axolotl/utils/data.py          | 10 +++--
 3 files changed, 74 insertions(+), 5 deletions(-)
 create mode 100644 examples/lora-alpaca-7b/config.yml

diff --git a/examples/lora-alpaca-7b/config.yml b/examples/lora-alpaca-7b/config.yml
new file mode 100644
index 000000000..0499b265f
--- /dev/null
+++ b/examples/lora-alpaca-7b/config.yml
@@ -0,0 +1,67 @@
+base_model: huggyllama/llama-7b
+base_model_config: huggyllama/llama-7b
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+load_in_4bit: false
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.02
+adapter: lora
+lora_model_dir:
+sequence_len: 512
+max_packed_sequence_len:
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.0
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./lora-out
+batch_size: 4
+micro_batch_size: 1
+num_epochs: 4
+optimizer: adamw_bnb_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0002
+train_on_inputs: false
+group_by_length: false
+bf16: false
+fp16: true
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 50
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index a6d237a11..df37ec85a 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -18,7 +18,7 @@ class AlpacaPrompter:
     prompt_style = None
 
     def __init__(self, prompt_style="instruct"):
-        self.prompt_style = prompt_style
+        self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value
         self.match_prompt_style()
 
     def match_prompt_style(self):
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 2f9a1afec..78f23fd52 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -60,10 +60,12 @@ def load_tokenized_prepared_datasets(
         else Path(default_dataset_prepared_path) / ds_hash
     )
     dataset = None
+    use_auth_token = False
     try:
         if cfg.push_dataset_to_hub:
+            use_auth_token = True
             dataset = load_dataset(
-                f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
+                f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token
             )
             dataset = dataset["train"]
     except:
@@ -83,7 +85,7 @@ def load_tokenized_prepared_datasets(
             ds = None
             ds_from_hub = False
             try:
-                load_dataset(d.path, streaming=True, use_auth_token=True)
+                load_dataset(d.path, streaming=True, use_auth_token=use_auth_token)
                 ds_from_hub = True
             except FileNotFoundError:
                 pass
@@ -99,10 +101,10 @@ def load_tokenized_prepared_datasets(
                         d.path,
                         streaming=False,
                         data_files=d.data_files,
-                        use_auth_token=True,
+                        use_auth_token=use_auth_token,
                     )
                 else:
-                    ds = load_dataset(d.path, streaming=False, use_auth_token=True)
+                    ds = load_dataset(d.path, streaming=False, use_auth_token=use_auth_token)
             else:
                 fp = hf_hub_download(
                     repo_id=d.path, repo_type="dataset", filename=d.data_files

From e3966543199c23df068f37cce18f73defa43cdb7 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:15:12 -0400
Subject: [PATCH 16/21] fix tokenizer loading, got openllama 3b working

---
 .../{lora-alpaca-7b => lora-openllama-3b}/config.yml   | 10 +++++-----
 src/axolotl/utils/models.py                            |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)
 rename examples/{lora-alpaca-7b => lora-openllama-3b}/config.yml (86%)

diff --git a/examples/lora-alpaca-7b/config.yml b/examples/lora-openllama-3b/config.yml
similarity index 86%
rename from examples/lora-alpaca-7b/config.yml
rename to examples/lora-openllama-3b/config.yml
index 0499b265f..393942d96 100644
--- a/examples/lora-alpaca-7b/config.yml
+++ b/examples/lora-openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: huggyllama/llama-7b
-base_model_config: huggyllama/llama-7b
+base_model: openlm-research/open_llama_3b_600bt_preview
+base_model_config: openlm-research/open_llama_3b_600bt_preview
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
@@ -32,9 +32,9 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-out
-batch_size: 4
-micro_batch_size: 1
-num_epochs: 4
+batch_size: 16
+micro_batch_size: 4
+num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 5b243bec4..de04e9333 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -211,12 +211,12 @@ def load_model(
         try:
             if is_llama_derived_model and "LlamaTokenizer" in globals():
                 tokenizer = LlamaTokenizer.from_pretrained(
-                    model,
+                    base_model_config,
                     trust_remote_code=True if cfg.trust_remote_code is True else False,
                 )
             else:
                 tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
-                    model,
+                    base_model_config,
                     trust_remote_code=True if cfg.trust_remote_code is True else False,
                 )
         except:

From 8d6a28953f409c8b23b85e96578b904adfee185b Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:18:28 -0400
Subject: [PATCH 17/21] fix relative path in flash-attn build

---
 docker/Dockerfile-base | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 5d84a7c24..c63f5a496 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -43,11 +43,11 @@ RUN git clone https://github.com/HazyResearch/flash-attention.git && \
     python3 setup.py bdist_wheel && \
     cd csrc/fused_dense_lib && \
     python3 setup.py bdist_wheel && \
-    cd ../csrc/xentropy && \
+    cd ../xentropy && \
     python3 setup.py bdist_wheel && \
-    cd ../csrc/rotary && \
+    cd ../rotary && \
     python3 setup.py bdist_wheel && \
-    cd ../csrc/layer_norm && \
+    cd ../layer_norm && \
     python3 setup.py bdist_wheel
 
 FROM base-builder AS deepspeed-builder

From 004820209d5f1954f11091bbdc7c84fab77614b2 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:21:02 -0400
Subject: [PATCH 18/21] Update src/axolotl/prompters.py

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
---
 src/axolotl/prompters.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index df37ec85a..fd9dfc8d4 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -17,7 +17,7 @@ class AlpacaPrompter:
     system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
     prompt_style = None
 
-    def __init__(self, prompt_style="instruct"):
+    def __init__(self, prompt_style=PromptStyle.instruct.value):
         self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value
         self.match_prompt_style()
 

From 98b1bce57e9ee96f3786b038c786baff7d399420 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:24:52 -0400
Subject: [PATCH 19/21] pr comments addressed

---
 examples/lora-openllama-3b/config.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/lora-openllama-3b/config.yml b/examples/lora-openllama-3b/config.yml
index 393942d96..6665044e0 100644
--- a/examples/lora-openllama-3b/config.yml
+++ b/examples/lora-openllama-3b/config.yml
@@ -13,7 +13,7 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter: lora
 lora_model_dir:
-sequence_len: 512
+sequence_len: 256
 max_packed_sequence_len:
 lora_r: 8
 lora_alpha: 16
@@ -43,7 +43,7 @@ train_on_inputs: false
 group_by_length: false
 bf16: false
 fp16: true
-tf32: true
+tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:

From d2a6f79fd1edbac3af14679a1b44af6ace36b9a4 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:41:17 -0400
Subject: [PATCH 20/21] change auth token setting back

---
 src/axolotl/utils/data.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index 78f23fd52..f849765c1 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -60,12 +60,11 @@ def load_tokenized_prepared_datasets(
         else Path(default_dataset_prepared_path) / ds_hash
     )
     dataset = None
-    use_auth_token = False
     try:
         if cfg.push_dataset_to_hub:
             use_auth_token = True
             dataset = load_dataset(
-                f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token
+                f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
             )
             dataset = dataset["train"]
     except:
@@ -85,7 +84,7 @@ def load_tokenized_prepared_datasets(
             ds = None
             ds_from_hub = False
             try:
-                load_dataset(d.path, streaming=True, use_auth_token=use_auth_token)
+                load_dataset(d.path, streaming=True, use_auth_token=True)
                 ds_from_hub = True
             except FileNotFoundError:
                 pass
@@ -104,7 +103,7 @@ def load_tokenized_prepared_datasets(
                         use_auth_token=use_auth_token,
                     )
                 else:
-                    ds = load_dataset(d.path, streaming=False, use_auth_token=use_auth_token)
+                    ds = load_dataset(d.path, streaming=False, use_auth_token=True)
             else:
                 fp = hf_hub_download(
                     repo_id=d.path, repo_type="dataset", filename=d.data_files

From 943961fd10da6aa0d891b1b21c326900e0a96d16 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Thu, 25 May 2023 12:42:56 -0400
Subject: [PATCH 21/21] missed ...

---
 src/axolotl/utils/data.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index f849765c1..2f9a1afec 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -62,7 +62,6 @@ def load_tokenized_prepared_datasets(
     dataset = None
     try:
         if cfg.push_dataset_to_hub:
-            use_auth_token = True
             dataset = load_dataset(
                 f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
             )
@@ -100,7 +99,7 @@ def load_tokenized_prepared_datasets(
                         d.path,
                         streaming=False,
                         data_files=d.data_files,
-                        use_auth_token=use_auth_token,
+                        use_auth_token=True,
                     )
                 else:
                     ds = load_dataset(d.path, streaming=False, use_auth_token=True)