From 94f5e415a3550772ac82bcab719de8c160ecdea9 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 19 Apr 2023 17:04:34 -0400 Subject: [PATCH 1/6] various bugfixes --- configs/stability_3b.yml | 33 +++++++++++++++++++++++++++++++++ scripts/finetune.py | 2 +- src/axolotl/datasets.py | 14 +++++++++----- src/axolotl/utils/data.py | 2 +- src/axolotl/utils/models.py | 11 +++++++++-- src/axolotl/utils/trainer.py | 11 ++++++++++- 6 files changed, 63 insertions(+), 10 deletions(-) create mode 100644 configs/stability_3b.yml diff --git a/configs/stability_3b.yml b/configs/stability_3b.yml new file mode 100644 index 000000000..8cfd8fa8c --- /dev/null +++ b/configs/stability_3b.yml @@ -0,0 +1,33 @@ +base_model: stabilityai/stablelm-base-alpha-3b +load_in_8bit: true +datasets: + - path: vicgalle/alpaca-gpt4 + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.04 +adapter: +lora_model_dir: +sequence_len: 4096 +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - v_proj +lora_fan_in_fan_out: false +wandb_project: stable-llama-3b +wandb_watch: +wandb_run_id: +wandb_log_model: checkpoint +output_dir: ./stable-llama-3b +batch_size: 128 +micro_batch_size: 16 +num_epochs: 1 +learning_rate: 0.00003 +train_on_inputs: false +group_by_length: false +bf16: true +tf32: true +early_stopping_patience: 3 +resume_from_checkpoint: +local_rank: diff --git a/scripts/finetune.py b/scripts/finetune.py index 4c24a3c4f..858f33f9a 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -159,7 +159,7 @@ def train( cfg.world_size = int(os.environ.get("WORLD_SIZE", 1)) cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0)) choose_device(cfg) - cfg.ddp = cfg.world_size != 1 + cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1 if cfg.ddp: cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))} cfg.gradient_accumulation_steps = ( diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index 862bd3229..deab5e438 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -1,3 +1,4 @@ +import logging from typing import List import torch @@ -92,11 +93,14 @@ class ConstantLengthDataset(IterableDataset): : self.seq_length ] labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length] - yield { - "input_ids": input_ids, - "labels": labels, - "attention_mask": attention_mask, - } + if labels.size() == input_ids.size() and attention_mask.size() == input_ids.size(): + yield { + "input_ids": input_ids, + "labels": labels, + "attention_mask": attention_mask, + } + else: + logging.warning("dropping batch due to tensor size mismatch") buffer = {"input_ids": [], "attention_mask": [], "labels": []} buffer_len = 0 diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 4e064a881..bbfa1aa18 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -65,7 +65,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): elif ds_from_hub: ds = load_dataset(d.path, streaming=True) else: - raise Exception("unhandled dataset load") + raise Exception(f"unhandled dataset load for {d.path}") if d.type == "alpaca": ds_strategy = AlpacaPromptTokenizingStrategy( diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 4f9bdfc0b..d05cc1927 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -102,13 +102,20 @@ def load_model( torch_dtype=torch_dtype, device_map=cfg.device_map, ) - else: + elif model_type: model = getattr(transformers, model_type).from_pretrained( base_model, 
load_in_8bit=cfg.load_in_8bit, torch_dtype=torch_dtype, device_map=cfg.device_map, ) + else: + model = AutoModelForCausalLM.from_pretrained( + base_model, + load_in_8bit=cfg.load_in_8bit, + torch_dtype=torch_dtype, + device_map=cfg.device_map, + ) except Exception as e: logging.error( "Exception raised attempting to load model, retrying with AutoModelForCausalLM" @@ -148,7 +155,7 @@ def load_model( model, lora_config = load_adapter(model, cfg, adapter) - if cfg.ddp: + if cfg.ddp and not load_in_8bit: model.to(f"cuda:{cfg.local_rank}") if cfg.load_4bit: diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 9f4262962..e0405357c 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -94,13 +94,22 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): ) trainer_kwargs["callbacks"] = [early_stop_cb] + data_collator_kwargs = { + "padding": True, + } + if cfg.collator_pad_to_longest: + data_collator_kwargs["padding"] = "longest" + else: + data_collator_kwargs["pad_to_multiple_of"] = 8 trainer = transformers.Trainer( model=model, train_dataset=train_dataset, eval_dataset=eval_dataset, args=training_args, data_collator=transformers.DataCollatorForSeq2Seq( - tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True + tokenizer, + return_tensors="pt", + **data_collator_kwargs, ), **trainer_kwargs, ) From 8e2a5609b3ebf3d549bbbc57c5a362835df1ee18 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 19 Apr 2023 18:13:51 -0400 Subject: [PATCH 2/6] stablelm support --- FAQS.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 FAQS.md diff --git a/FAQS.md b/FAQS.md new file mode 100644 index 000000000..7eb741482 --- /dev/null +++ b/FAQS.md @@ -0,0 +1,4 @@ +# FAQs + +- Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! 
Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874) +- From 8d437853c889e1ed6b3c93d6ef756d0f6e627f65 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 20 Apr 2023 09:19:46 -0400 Subject: [PATCH 3/6] fix sharegpt handling from hf, don't worry about loading llama if using earlier transformers release --- configs/llama_65B_alpaca.yml | 5 ++++- src/axolotl/prompters.py | 4 ++++ src/axolotl/utils/data.py | 14 ++++++++++++-- src/axolotl/utils/models.py | 13 +++++++++---- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml index 917faa97a..ab809defd 100644 --- a/configs/llama_65B_alpaca.yml +++ b/configs/llama_65B_alpaca.yml @@ -5,7 +5,8 @@ load_in_8bit: true datasets: - path: data/alpaca_data_gpt4.jsonl type: alpaca - - path: data/vicuna_cleaned.jsonl + - path: anon8231489123/ShareGPT_Vicuna_unfiltered + data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json type: sharegpt - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl type: gpteacher @@ -30,6 +31,8 @@ wandb_log_model: checkpoint output_dir: ./lora-llama-alpaca batch_size: 128 micro_batch_size: 16 +warmup_steps: 1000 +save_steps: num_epochs: 5 learning_rate: 0.00003 train_on_inputs: false diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index 070f10acb..903ee4385 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -128,6 +128,10 @@ conv_vicuna_v1_1 = Conversation( class ShareGPTPrompter: def build_prompt(self, source, tokenizer): + # ignore the system prompt if provided + if source[0]["from"] == "system": + source.pop(0) + if len(source) < 2: # If there isn't a back and forth conversation, ignore it # also happens on the data splitting leaving empty conversations diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index bbfa1aa18..081f1d851 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -3,6 +3,7 @@ from hashlib import md5 from pathlib import Path from datasets import load_from_disk, load_dataset, IterableDataset, Dataset +from huggingface_hub import hf_hub_download from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset from axolotl.prompt_tokenizers import ( @@ -50,6 +51,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): logging.info("Loading raw datasets...") datasets = [] for d in cfg.datasets: + ds = None ds_from_hub = False try: load_dataset(d.path, streaming=True) @@ -63,9 +65,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): "json", data_files=d.path, streaming=True, split=None ) elif ds_from_hub: - ds = load_dataset(d.path, streaming=True) + if d.data_files: + ds = load_dataset(d.path, streaming=True, data_files=d.data_files) + else: + ds = load_dataset(d.path, streaming=True) else: - raise Exception(f"unhandled dataset load for {d.path}") + fp = hf_hub_download(repo_id=d.path, repo_type="dataset", filename=d.data_files) + ds = load_dataset("json", data_files=fp, streaming=True, split=None) + if not ds: + raise Exception("unhandled dataset load") if d.type == "alpaca": ds_strategy = AlpacaPromptTokenizingStrategy( @@ -111,6 +119,8 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): seq_length=max_packed_sequence_len, ) logging.info("merging, packing, shuffling, and splitting master dataset") + # TODO don't split dataset here, shuffle and save first, then split, that way we can + # re-split when loading again dataset = Dataset.from_list([_ 
for _ in constant_len_dataset]).train_test_split( test_size=cfg.val_set_size, shuffle=True, seed=42 ) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index d05cc1927..750f394b5 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -7,11 +7,16 @@ import torch import transformers from transformers import ( AutoModelForCausalLM, - LlamaForCausalLM, - LlamaTokenizer, AutoTokenizer, PreTrainedModel, ) +try: + from transformers import ( + LlamaForCausalLM, + LlamaTokenizer, + ) +except: + logging.warning("This version of transformers does not support Llama. Consider upgrading.") from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN @@ -95,7 +100,7 @@ def load_model( else True, ) load_in_8bit = False - elif is_llama_derived_model: + elif is_llama_derived_model and "LlamaForCausalLM" in globals(): model = LlamaForCausalLM.from_pretrained( base_model, load_in_8bit=cfg.load_in_8bit, @@ -130,7 +135,7 @@ def load_model( if not tokenizer: try: - if is_llama_derived_model: + if is_llama_derived_model and "LlamaTokenizer" in globals(): tokenizer = LlamaTokenizer.from_pretrained(model) else: tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model) From 4f2584f2dc39bd50aa1a0bff191457829042ac89 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 20 Apr 2023 14:39:47 -0400 Subject: [PATCH 4/6] shuffle and split dataset after save/load --- FAQS.md | 2 +- ds_config.json | 10 +++++----- src/axolotl/utils/data.py | 9 ++++----- src/axolotl/utils/models.py | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/FAQS.md b/FAQS.md index 7eb741482..bdf056be7 100644 --- a/FAQS.md +++ b/FAQS.md @@ -1,4 +1,4 @@ # FAQs - Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874) -- +- Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases diff --git a/ds_config.json b/ds_config.json index 05fc98177..ffd6f2075 100644 --- a/ds_config.json +++ b/ds_config.json @@ -11,11 +11,10 @@ "min_loss_scale": 1 }, "scheduler": { - "type": "WarmupLR", + "type": "OneCycle", "params": { - "warmup_min_lr": "auto", - "warmup_max_lr": "auto", - "warmup_num_steps": "auto" + "cycle_min_lr": 1e-7, + "cycle_max_lr": 1e-4 } }, "zero_optimization": { @@ -25,7 +24,8 @@ "allgather_bucket_size": 5e8, "contiguous_gradients": true, "reduce_bucket_size": "auto", - "reduce_scatter": true + "reduce_scatter": true, + "stage3_gather_16bit_weights_on_model_save": true }, "gradient_accumulation_steps": "auto", "gradient_clipping": "auto", diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 081f1d851..cb8ba93bf 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -119,16 +119,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): seq_length=max_packed_sequence_len, ) logging.info("merging, packing, shuffling, and splitting master dataset") - # TODO don't split dataset here, shuffle and save first, then split, that way we can - # re-split when loading again - dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split( - test_size=cfg.val_set_size, shuffle=True, seed=42 - ) + dataset = Dataset.from_list([_ for _ in constant_len_dataset]).shuffle(seed=42) if cfg.local_rank == 0: logging.info(f"Saving prepared dataset to disk... 
{prepared_ds_path}") dataset.save_to_disk(prepared_ds_path) + dataset = dataset.train_test_split( + test_size=cfg.val_set_size, shuffle=False + ) train_dataset = dataset["train"] eval_dataset = dataset["test"] diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 750f394b5..eb54eba80 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -75,7 +75,7 @@ def load_model( snapshot_download_kwargs = {} if cfg.base_model_ignore_patterns: snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns - cache_model_path = Path(snapshot_download(base_model, ** snapshot_download_kwargs)) + cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs)) files = ( list(cache_model_path.glob("*.pt")) + list(cache_model_path.glob("*.safetensors")) From 097d367af6bb92d0ed30f35ddd6406f13dedc59a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 22 Apr 2023 16:25:23 -0400 Subject: [PATCH 5/6] tweaks to data loading, 8 bit adam, accelerate and deepspeed --- configs/llama_13B_alpaca.yml | 39 ++++++++++++++++++++++++++++++++++++ src/axolotl/utils/data.py | 29 ++++++++++++++++++--------- src/axolotl/utils/models.py | 19 ++++++++++++------ src/axolotl/utils/trainer.py | 19 +++++++++++++++--- 4 files changed, 87 insertions(+), 19 deletions(-) create mode 100644 configs/llama_13B_alpaca.yml diff --git a/configs/llama_13B_alpaca.yml b/configs/llama_13B_alpaca.yml new file mode 100644 index 000000000..420a62209 --- /dev/null +++ b/configs/llama_13B_alpaca.yml @@ -0,0 +1,39 @@ +base_model: huggyllama/llama-13b +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +load_in_8bit: true +datasets: + - path: anon8231489123/ShareGPT_Vicuna_unfiltered + data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json + type: sharegpt +dataset_prepared_path: last_run_prepared +val_set_size: 0.002 +adapter: +lora_model_dir: +sequence_len: 2048 +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - v_proj +lora_fan_in_fan_out: false +wandb_project: +wandb_watch: +wandb_run_id: +wandb_log_model: checkpoint +output_dir: ./llama-13b-sharegpt +batch_size: 64 +micro_batch_size: 2 +warmup_steps: 1000 +save_steps: +eval_steps: +num_epochs: 5 +learning_rate: 0.00003 +train_on_inputs: false +group_by_length: false +bf16: true +tf32: true +early_stopping_patience: 5 +resume_from_checkpoint: +local_rank: diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index cb8ba93bf..0804312e7 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -2,7 +2,7 @@ import logging from hashlib import md5 from pathlib import Path -from datasets import load_from_disk, load_dataset, IterableDataset, Dataset +from datasets import load_from_disk, load_dataset, IterableDataset, Dataset, concatenate_datasets from huggingface_hub import hf_hub_download from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset @@ -44,10 +44,11 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): ) if any(prepared_ds_path.glob("*")): - logging.info("Loading prepared dataset from disk...") + logging.info(f"Loading prepared dataset from disk ay {prepared_ds_path}...") dataset = load_from_disk(str(prepared_ds_path)) logging.info("Prepared dataset loaded from disk...") else: + logging.info(f"Unable to find prepared dataset in {prepared_ds_path}") logging.info("Loading raw datasets...") datasets = [] for d in cfg.datasets: @@ -113,18 +114,26 @@ def load_prepare_datasets(tokenizer, cfg, 
default_dataset_prepared_path): datasets.append(ds_wrapper) else: logging.error(f"unhandled prompt tokenization strategy: {d.type}") - constant_len_dataset = ConstantLengthDataset( - tokenizer, - datasets, - seq_length=max_packed_sequence_len, - ) - logging.info("merging, packing, shuffling, and splitting master dataset") - dataset = Dataset.from_list([_ for _ in constant_len_dataset]).shuffle(seed=42) + logging.info("merging and shuffling master dataset") + dataset = concatenate_datasets(datasets).shuffle(seed=42) if cfg.local_rank == 0: - logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}") + logging.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}") dataset.save_to_disk(prepared_ds_path) + if cfg.max_packed_sequence_len is not None: + constant_len_dataset = ConstantLengthDataset( + tokenizer, + [dataset], + seq_length=max_packed_sequence_len, + ) + logging.info("packing master dataset") + dataset = Dataset.from_list([_ for _ in constant_len_dataset]) + + if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: + logging.info(f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards") + dataset = dataset.shard(num_shards=cfg.dataset_shard_num, index=cfg.dataset_shard_idx) + dataset = dataset.train_test_split( test_size=cfg.val_set_size, shuffle=False ) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index eb54eba80..6d7cbfb52 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -101,12 +101,19 @@ def load_model( ) load_in_8bit = False elif is_llama_derived_model and "LlamaForCausalLM" in globals(): - model = LlamaForCausalLM.from_pretrained( - base_model, - load_in_8bit=cfg.load_in_8bit, - torch_dtype=torch_dtype, - device_map=cfg.device_map, - ) + if not cfg.load_in_8bit: + model = LlamaForCausalLM.from_pretrained( + base_model, + device_map=cfg.device_map, + ) + else: + model = LlamaForCausalLM.from_pretrained( + base_model, + load_in_8bit=cfg.load_in_8bit, + torch_dtype=torch_dtype, + device_map=cfg.device_map, + ) + elif model_type: model = getattr(transformers, model_type).from_pretrained( base_model, diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index e0405357c..446cc51b5 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,5 +1,9 @@ import math +import os +from pathlib import Path + import bitsandbytes as bnb +import torch.cuda import transformers from torch import nn from torch.optim.lr_scheduler import OneCycleLR @@ -12,7 +16,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) ) warmup_steps = cfg.warmup_steps if cfg.warmup_steps else min(int(0.03 * total_num_steps), 100) - logging_steps = max(min(int(0.005 * total_num_steps), 10), 1) + logging_steps = cfg.logging_steps if cfg.logging_steps else max(min(int(0.005 * total_num_steps), 10), 1) save_steps = eval_steps = cfg.save_steps if cfg.save_steps else min(int(0.05 * total_num_steps), 200) training_arguments_kwargs = {} @@ -26,6 +30,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): if cfg.gradient_checkpointing is not None: training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing + # deepspeed + if os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" and torch.cuda.device_count() > 1: + if cfg.deepspeed: + training_arguments_kwargs["deepspeed"] = cfg.deepspeed + else: + # make a guess here + # TODO search Path("./") for one + 
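+            # assumption: a ./ds_config.json (such as the one updated earlier in this
+            # series) is present in the working directory; an explicit cfg.deepspeed
+            # path takes precedence over this fallback when provided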
training_arguments_kwargs["deepspeed"] = "./ds_config.json" + training_args = transformers.TrainingArguments( per_device_train_batch_size=cfg.micro_batch_size, gradient_accumulation_steps=cfg.gradient_accumulation_steps, @@ -37,7 +50,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): save_steps=save_steps, output_dir=cfg.output_dir, save_total_limit=3, - load_best_model_at_end=True if cfg.val_set_size > 0 else False, + load_best_model_at_end=True if cfg.val_set_size > 0 and save_steps % eval_steps == 0 else False, ddp_find_unused_parameters=False if cfg.ddp else None, group_by_length=cfg.group_by_length, report_to="wandb" if cfg.use_wandb else None, @@ -47,7 +60,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): trainer_kwargs = {} - if cfg.load_in_8bit and not cfg.load_4bit: + if cfg.optimizer == "adam8bit" and not cfg.load_4bit and not "deepspeed" in training_arguments_kwargs: decay_parameters = get_parameter_names(model, [nn.LayerNorm]) decay_parameters = [name for name in decay_parameters if "bias" not in name] optimizer_grouped_parameters = [ From 4a17a4c9a19357eca3233cb2efde761f0fb761e3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 24 Apr 2023 10:54:45 -0400 Subject: [PATCH 6/6] fix dataset handling, support galactica --- configs/galactica_1_3B.yml | 41 +++++++++++++++++++++++++++++++++++++ src/axolotl/utils/data.py | 25 ++++++++++++---------- src/axolotl/utils/models.py | 4 ++++ 3 files changed, 59 insertions(+), 11 deletions(-) create mode 100644 configs/galactica_1_3B.yml diff --git a/configs/galactica_1_3B.yml b/configs/galactica_1_3B.yml new file mode 100644 index 000000000..ed722f34e --- /dev/null +++ b/configs/galactica_1_3B.yml @@ -0,0 +1,41 @@ +base_model: facebook/galactica-1.3b +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +load_in_8bit: false +datasets: + - path: tatsu-lab/alpaca + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +adapter: +lora_model_dir: +sequence_len: 1024 +max_packed_sequence_len: 1024 +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - v_proj +lora_fan_in_fan_out: false +wandb_project: +wandb_watch: +wandb_run_id: +wandb_log_model: checkpoint +output_dir: ./lora-llama-alpaca +batch_size: 32 +micro_batch_size: 16 +num_epochs: 3 +learning_rate: 0.00003 +train_on_inputs: false +group_by_length: false +bf16: false +tf32: false +early_stopping_patience: +resume_from_checkpoint: +local_rank: +special_tokens: + pad_token: "[PAD]" + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 0804312e7..d315da98c 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -31,7 +31,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): ds_hash = str( md5( ( - str(max_packed_sequence_len) + str(cfg.sequence_len) + "@" + "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets])) ).encode("utf-8") @@ -114,21 +114,24 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): datasets.append(ds_wrapper) else: logging.error(f"unhandled prompt tokenization strategy: {d.type}") - logging.info("merging and shuffling master dataset") + logging.info("tokenizing, merging, and shuffling master dataset") - dataset = concatenate_datasets(datasets).shuffle(seed=42) + samples = [] + for d in datasets: + samples = samples + [i for i in d] + dataset = Dataset.from_list(samples).shuffle(seed=42) if cfg.local_rank == 0: 
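+            # only the rank-0 process writes the merged, shuffled dataset to disk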
logging.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}") dataset.save_to_disk(prepared_ds_path) - if cfg.max_packed_sequence_len is not None: - constant_len_dataset = ConstantLengthDataset( - tokenizer, - [dataset], - seq_length=max_packed_sequence_len, - ) - logging.info("packing master dataset") - dataset = Dataset.from_list([_ for _ in constant_len_dataset]) + if cfg.max_packed_sequence_len is not None: + constant_len_dataset = ConstantLengthDataset( + tokenizer, + [dataset], + seq_length=max_packed_sequence_len, + ) + logging.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}") + dataset = Dataset.from_list([_ for _ in constant_len_dataset]) if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: logging.info(f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards") diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 6d7cbfb52..a14f89b19 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -161,6 +161,10 @@ def load_model( tokenizer.add_special_tokens({"pad_token": "[PAD]"}) os.environ["TOKENIZERS_PARALLELISM"] = "false" + if cfg.special_tokens: + for k, v in cfg.special_tokens.items(): + setattr(tokenizer, k, v) + if load_in_8bit and not cfg.load_4bit: logging.info("converting model w/ prepare_model_for_int8_training") model = prepare_model_for_int8_training(model)