diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
index 917faa97a..ab809defd 100644
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -5,7 +5,8 @@ load_in_8bit: true
 datasets:
   - path: data/alpaca_data_gpt4.jsonl
     type: alpaca
-  - path: data/vicuna_cleaned.jsonl
+  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
+    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
     type: sharegpt
   - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
     type: gpteacher
@@ -30,6 +31,8 @@ wandb_log_model: checkpoint
 output_dir: ./lora-llama-alpaca
 batch_size: 128
 micro_batch_size: 16
+warmup_steps: 1000
+save_steps:
 num_epochs: 5
 learning_rate: 0.00003
 train_on_inputs: false
diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index 070f10acb..903ee4385 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -128,6 +128,10 @@ conv_vicuna_v1_1 = Conversation(
 
 class ShareGPTPrompter:
     def build_prompt(self, source, tokenizer):
+        # ignore the system prompt if provided
+        if source[0]["from"] == "system":
+            source.pop(0)
+
         if len(source) < 2:
             # If there isn't a back and forth conversation, ignore it
             # also happens on the data splitting leaving empty conversations
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index bbfa1aa18..081f1d851 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -3,6 +3,7 @@ from hashlib import md5
 from pathlib import Path
 
 from datasets import load_from_disk, load_dataset, IterableDataset, Dataset
+from huggingface_hub import hf_hub_download
 
 from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset
 from axolotl.prompt_tokenizers import (
@@ -50,6 +51,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
     logging.info("Loading raw datasets...")
     datasets = []
     for d in cfg.datasets:
+        ds = None
         ds_from_hub = False
         try:
             load_dataset(d.path, streaming=True)
@@ -63,9 +65,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
                 "json", data_files=d.path, streaming=True, split=None
             )
         elif ds_from_hub:
-            ds = load_dataset(d.path, streaming=True)
+            if d.data_files:
+                ds = load_dataset(d.path, streaming=True, data_files=d.data_files)
+            else:
+                ds = load_dataset(d.path, streaming=True)
         else:
-            raise Exception(f"unhandled dataset load for {d.path}")
+            fp = hf_hub_download(repo_id=d.path, repo_type="dataset", filename=d.data_files)
+            ds = load_dataset("json", data_files=fp, streaming=True, split=None)
+        if not ds:
+            raise Exception("unhandled dataset load")
 
         if d.type == "alpaca":
             ds_strategy = AlpacaPromptTokenizingStrategy(
@@ -111,6 +119,8 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             seq_length=max_packed_sequence_len,
         )
         logging.info("merging, packing, shuffling, and splitting master dataset")
+        # TODO don't split dataset here, shuffle and save first, then split, that way we can
+        # re-split when loading again
         dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split(
             test_size=cfg.val_set_size, shuffle=True, seed=42
         )
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index d05cc1927..750f394b5 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -7,11 +7,16 @@
 import torch
 import transformers
 from transformers import (
     AutoModelForCausalLM,
-    LlamaForCausalLM,
-    LlamaTokenizer,
     AutoTokenizer,
     PreTrainedModel,
 )
+try:
+    from transformers import (
+        LlamaForCausalLM,
+        LlamaTokenizer,
+    )
+except:
+    logging.warning("This version of transformers does not support Llama. Consider upgrading.")
 
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
@@ -95,7 +100,7 @@ def load_model(
             else True,
         )
         load_in_8bit = False
-    elif is_llama_derived_model:
+    elif is_llama_derived_model and "LlamaForCausalLM" in globals():
         model = LlamaForCausalLM.from_pretrained(
             base_model,
             load_in_8bit=cfg.load_in_8bit,
@@ -130,7 +135,7 @@ def load_model(
 
     if not tokenizer:
         try:
-            if is_llama_derived_model:
+            if is_llama_derived_model and "LlamaTokenizer" in globals():
                 tokenizer = LlamaTokenizer.from_pretrained(model)
             else:
                 tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model)