From a5d739b66b5fc7123d085a99a2b9e0a9dd5df92f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 11:59:08 -0400 Subject: [PATCH 1/6] fixes w/ example for super basic lora starter --- examples/lora-alpaca-7b/config.yml | 67 ++++++++++++++++++++++++++++++ src/axolotl/prompters.py | 2 +- src/axolotl/utils/data.py | 10 +++-- 3 files changed, 74 insertions(+), 5 deletions(-) create mode 100644 examples/lora-alpaca-7b/config.yml diff --git a/examples/lora-alpaca-7b/config.yml b/examples/lora-alpaca-7b/config.yml new file mode 100644 index 000000000..0499b265f --- /dev/null +++ b/examples/lora-alpaca-7b/config.yml @@ -0,0 +1,67 @@ +base_model: huggyllama/llama-7b +base_model_config: huggyllama/llama-7b +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +load_in_8bit: true +load_in_4bit: false +strict: false +push_dataset_to_hub: +datasets: + - path: teknium/GPT4-LLM-Cleaned + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.02 +adapter: lora +lora_model_dir: +sequence_len: 512 +max_packed_sequence_len: +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.0 +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj +lora_fan_in_fan_out: +wandb_project: +wandb_watch: +wandb_run_id: +wandb_log_model: +output_dir: ./lora-out +batch_size: 4 +micro_batch_size: 1 +num_epochs: 4 +optimizer: adamw_bnb_8bit +torchdistx_path: +lr_scheduler: cosine +learning_rate: 0.0002 +train_on_inputs: false +group_by_length: false +bf16: false +fp16: true +tf32: true +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: true +flash_attention: +gptq_groupsize: +gptq_model_v1: +warmup_steps: 10 +eval_steps: 50 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "<s>" + eos_token: "</s>" + unk_token: "<unk>" diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index a6d237a11..df37ec85a 
100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -18,7 +18,7 @@ class AlpacaPrompter: prompt_style = None def __init__(self, prompt_style="instruct"): - self.prompt_style = prompt_style + self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value self.match_prompt_style() def match_prompt_style(self): diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 2f9a1afec..78f23fd52 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -60,10 +60,12 @@ def load_tokenized_prepared_datasets( else Path(default_dataset_prepared_path) / ds_hash ) dataset = None + use_auth_token = False try: if cfg.push_dataset_to_hub: + use_auth_token = True dataset = load_dataset( - f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True + f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token ) dataset = dataset["train"] except: @@ -83,7 +85,7 @@ def load_tokenized_prepared_datasets( ds = None ds_from_hub = False try: - load_dataset(d.path, streaming=True, use_auth_token=True) + load_dataset(d.path, streaming=True, use_auth_token=use_auth_token) ds_from_hub = True except FileNotFoundError: pass @@ -99,10 +101,10 @@ def load_tokenized_prepared_datasets( d.path, streaming=False, data_files=d.data_files, - use_auth_token=True, + use_auth_token=use_auth_token, ) else: - ds = load_dataset(d.path, streaming=False, use_auth_token=True) + ds = load_dataset(d.path, streaming=False, use_auth_token=use_auth_token) else: fp = hf_hub_download( repo_id=d.path, repo_type="dataset", filename=d.data_files From e3966543199c23df068f37cce18f73defa43cdb7 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 12:15:12 -0400 Subject: [PATCH 2/6] fix tokenizer loading, got openllama 3b working --- .../{lora-alpaca-7b => lora-openllama-3b}/config.yml | 10 +++++----- src/axolotl/utils/models.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) rename examples/{lora-alpaca-7b => 
lora-openllama-3b}/config.yml (86%) diff --git a/examples/lora-alpaca-7b/config.yml b/examples/lora-openllama-3b/config.yml similarity index 86% rename from examples/lora-alpaca-7b/config.yml rename to examples/lora-openllama-3b/config.yml index 0499b265f..393942d96 100644 --- a/examples/lora-alpaca-7b/config.yml +++ b/examples/lora-openllama-3b/config.yml @@ -1,5 +1,5 @@ -base_model: huggyllama/llama-7b -base_model_config: huggyllama/llama-7b +base_model: openlm-research/open_llama_3b_600bt_preview +base_model_config: openlm-research/open_llama_3b_600bt_preview model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer load_in_8bit: true @@ -32,9 +32,9 @@ wandb_watch: wandb_run_id: wandb_log_model: output_dir: ./lora-out -batch_size: 4 -micro_batch_size: 1 -num_epochs: 4 +batch_size: 16 +micro_batch_size: 4 +num_epochs: 3 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 5b243bec4..de04e9333 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -211,12 +211,12 @@ def load_model( try: if is_llama_derived_model and "LlamaTokenizer" in globals(): tokenizer = LlamaTokenizer.from_pretrained( - model, + base_model_config, trust_remote_code=True if cfg.trust_remote_code is True else False, ) else: tokenizer = getattr(transformers, tokenizer_type).from_pretrained( - model, + base_model_config, trust_remote_code=True if cfg.trust_remote_code is True else False, ) except: From 004820209d5f1954f11091bbdc7c84fab77614b2 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 12:21:02 -0400 Subject: [PATCH 3/6] Update src/axolotl/prompters.py Co-authored-by: NanoCode012 --- src/axolotl/prompters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index df37ec85a..fd9dfc8d4 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -17,7 +17,7 @@ class AlpacaPrompter: 
system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n" prompt_style = None - def __init__(self, prompt_style="instruct"): + def __init__(self, prompt_style=PromptStyle.instruct.value): self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value self.match_prompt_style() From 98b1bce57e9ee96f3786b038c786baff7d399420 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 12:24:52 -0400 Subject: [PATCH 4/6] pr comments addressed --- examples/lora-openllama-3b/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lora-openllama-3b/config.yml b/examples/lora-openllama-3b/config.yml index 393942d96..6665044e0 100644 --- a/examples/lora-openllama-3b/config.yml +++ b/examples/lora-openllama-3b/config.yml @@ -13,7 +13,7 @@ dataset_prepared_path: last_run_prepared val_set_size: 0.02 adapter: lora lora_model_dir: -sequence_len: 512 +sequence_len: 256 max_packed_sequence_len: lora_r: 8 lora_alpha: 16 @@ -43,7 +43,7 @@ train_on_inputs: false group_by_length: false bf16: false fp16: true -tf32: true +tf32: false gradient_checkpointing: true early_stopping_patience: resume_from_checkpoint: From d2a6f79fd1edbac3af14679a1b44af6ace36b9a4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 12:41:17 -0400 Subject: [PATCH 5/6] change auth token setting back --- src/axolotl/utils/data.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 78f23fd52..f849765c1 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -60,12 +60,11 @@ def load_tokenized_prepared_datasets( else Path(default_dataset_prepared_path) / ds_hash ) dataset = None - use_auth_token = False try: if cfg.push_dataset_to_hub: use_auth_token = True dataset = load_dataset( - f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=use_auth_token + 
f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True ) dataset = dataset["train"] except: @@ -85,7 +84,7 @@ def load_tokenized_prepared_datasets( ds = None ds_from_hub = False try: - load_dataset(d.path, streaming=True, use_auth_token=use_auth_token) + load_dataset(d.path, streaming=True, use_auth_token=True) ds_from_hub = True except FileNotFoundError: pass @@ -104,7 +103,7 @@ def load_tokenized_prepared_datasets( use_auth_token=use_auth_token, ) else: - ds = load_dataset(d.path, streaming=False, use_auth_token=use_auth_token) + ds = load_dataset(d.path, streaming=False, use_auth_token=True) else: fp = hf_hub_download( repo_id=d.path, repo_type="dataset", filename=d.data_files From 943961fd10da6aa0d891b1b21c326900e0a96d16 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 25 May 2023 12:42:56 -0400 Subject: [PATCH 6/6] missed ... --- src/axolotl/utils/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index f849765c1..2f9a1afec 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -62,7 +62,6 @@ def load_tokenized_prepared_datasets( dataset = None try: if cfg.push_dataset_to_hub: - use_auth_token = True dataset = load_dataset( f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True ) @@ -100,7 +99,7 @@ def load_tokenized_prepared_datasets( d.path, streaming=False, data_files=d.data_files, - use_auth_token=use_auth_token, + use_auth_token=True, ) else: ds = load_dataset(d.path, streaming=False, use_auth_token=True)