From 1edc30c786794ba2d57976c417378a0d27ced6eb Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 27 May 2023 17:57:29 -0400
Subject: [PATCH 01/55] add support for opimum bettertransformers

---
 configs/gpt_neox_20b.yml        | 30 ++++++++++++++++++------------
 requirements.txt                |  1 +
 scripts/finetune.py             | 15 +++++++++++----
 src/axolotl/utils/models.py     |  8 ++++++--
 src/axolotl/utils/validation.py |  8 ++++++++
 5 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/configs/gpt_neox_20b.yml b/configs/gpt_neox_20b.yml
index 730afb72c..25fdae53b 100644
--- a/configs/gpt_neox_20b.yml
+++ b/configs/gpt_neox_20b.yml
@@ -1,24 +1,25 @@
 base_model: EleutherAI/gpt-neox-20b
+base_model_config: EleutherAI/gpt-neox-20b
 base_model_ignore_patterns: pytorch* # prefer safetensors
 model_type: GPTNeoXForCausalLM
 tokenizer_type: AutoTokenizer
-load_in_8bit: true
+load_in_8bit: false
+load_in_4bit: true
+load_4bit: false
 datasets:
-  - path: nomic-ai/gpt4all-j-prompt-generations
+  - path: vicgalle/alpaca-gpt4
     type: alpaca
-    shards: 4
-    shards_index: 0
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
-adapter: lora
+adapter:
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
-lora_r: 8
+lora_r: 64
 lora_alpha: 32
-lora_dropout: 0.05
+lora_dropout: 0.0
 lora_target_modules:
-  - query_key_value
+lora_target_linear: true
 lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific
 wandb_project: gpt4all-neox-20b
 wandb_watch:
@@ -26,14 +27,19 @@ wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
 gradient_accumulation_steps: 1
-micro_batch_size: 4
+micro_batch_size: 2
 num_epochs: 5
 learning_rate: 0.00003
-lr_scheduler: one_cycle
+optimizer: paged_adamw_32bit
+lr_scheduler: cosine
 train_on_inputs: false
 group_by_length: false
-bf16: True
-tf32: True
+bf16: false
+fp16: false
+float16: true
+tf32: true
+flash_optimum: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
+gradient_checkpointing: true
diff --git a/requirements.txt b/requirements.txt
index c9123fce8..d1b2f4555 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ sentencepiece
 wandb
 einops
 xformers
+optimum
 # qlora things
 bert-score==0.3.13
 evaluate==0.4.0
diff --git a/scripts/finetune.py b/scripts/finetune.py
index fa2dcf903..a5b5e7c85 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -6,6 +6,7 @@
 import os
 import random
 import signal
 import sys
+from functools import partial
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
@@ -19,6 +20,8 @@
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_tokenizer
 # add src to the pythonpath so we don't need to pip install this
+from optimum.bettertransformer import BetterTransformer
+
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.validation import validate_config
@@ -264,12 +267,14 @@ def train(

     # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
     if cfg.local_rank == 0:
+        def terminate_handler(signum, frame, model):
+            if cfg.flash_optimum:
+                model = BetterTransformer.reverse(model)
+            model.save_pretrained(cfg.output_dir)
+            sys.exit(0)
         signal.signal(
             signal.SIGINT,
-            lambda signal, frame: (
-                model.save_pretrained(cfg.output_dir),
-                sys.exit(0),
-            ),
+            lambda signum, frame: terminate_handler(signum, frame, model)
         )

     logging.info("Starting trainer...")
@@ -299,6 +304,8 @@ def train(
     # TODO do we need this fix?
https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file if cfg.local_rank == 0: + if cfg.flash_optimum: + model = BetterTransformer.reverse(model) model.save_pretrained(cfg.output_dir) # trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 1acaf6ab3..11b4629ec 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,7 +11,8 @@ import bitsandbytes as bnb import torch import transformers from transformers import PreTrainedModel # noqa: F401 -from transformers import ( # noqa: F401 +from optimum.bettertransformer import BetterTransformer +from transformers import ( AutoConfig, AutoModelForCausalLM, AutoTokenizer, @@ -137,7 +138,7 @@ def load_model( if cfg.bf16: torch_dtype = torch.bfloat16 - elif cfg.load_in_8bit or cfg.fp16: + elif cfg.load_in_8bit or cfg.fp16 or cfg.float16: torch_dtype = torch.float16 else: torch_dtype = torch.float32 @@ -342,6 +343,9 @@ def load_model( logging.warning("there are no parameters that require gradient updates") model.config.use_cache = False + if cfg.flash_optimum: + model = BetterTransformer.transform(model) + # TODO resume_from_checkpoint handling return model, lora_config diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 04ffc4c1b..ba5feafe8 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -57,6 +57,14 @@ def validate_config(cfg): if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp: raise ValueError("FSDP is not supported for falcon models") + if cfg.flash_optimum is True: + if cfg.adapter: + logging.warning("BetterTransformers probably doesn't work with PEFT adapters") + if cfg.fp16 or cfg.bf16: + raise ValueError("AMP is not supported with BetterTransformer") + if cfg.float16 is not True: + logging.warning("You should probably set float16 to true") + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 From 879219979955fa2c3a2394578a8886f77e687594 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 27 May 2023 18:12:12 -0400 Subject: [PATCH 02/55] add flash attn context for efficient training and attempt setting model to train mode: --- scripts/finetune.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index a5b5e7c85..99236b087 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -252,6 +252,24 @@ def train( model.save_pretrained(cfg.output_dir) return + if cfg.debug: + logging.info("check_dataset_labels...") + check_dataset_labels( + train_dataset.select( + [random.randrange(0, len(train_dataset) - 1) for i in range(5)] + ), + tokenizer, + ) + + if prepare_ds_only: + logging.info("Finished preparing dataset. 
Exiting...") + return + + try: + model.train() + except: + pass + trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) model.config.use_cache = False @@ -297,7 +315,11 @@ def train( if not Path(cfg.output_dir).is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) - trainer.train(resume_from_checkpoint=resume_from_checkpoint) + if cfg.flash_optimum: + with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + trainer.train(resume_from_checkpoint=resume_from_checkpoint) + else: + trainer.train(resume_from_checkpoint=resume_from_checkpoint) logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}") From 39619028a37f4af77dd0b89c9b8191c783d7049a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 27 May 2023 19:37:24 -0400 Subject: [PATCH 03/55] use pythia-12b, neox-20b is flaky --- examples/pythia-12b/README.md | 10 ++++++++++ .../pythia-12b/config.yml | 20 +++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) create mode 100644 examples/pythia-12b/README.md rename configs/gpt_neox_20b.yml => examples/pythia-12b/config.yml (72%) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md new file mode 100644 index 000000000..0953caa4e --- /dev/null +++ b/examples/pythia-12b/README.md @@ -0,0 +1,10 @@ +# Python 12B + +- Single-GPU A100 only (?) + +```shell +python scripts/finetune.py examples/pythia-12b/config.yml +``` + +⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️ + diff --git a/configs/gpt_neox_20b.yml b/examples/pythia-12b/config.yml similarity index 72% rename from configs/gpt_neox_20b.yml rename to examples/pythia-12b/config.yml index 25fdae53b..28e822c77 100644 --- a/configs/gpt_neox_20b.yml +++ b/examples/pythia-12b/config.yml @@ -1,11 +1,12 @@ -base_model: EleutherAI/gpt-neox-20b -base_model_config: EleutherAI/gpt-neox-20b +base_model: EleutherAI/pythia-12b-deduped +base_model_config: EleutherAI/pythia-12b-deduped base_model_ignore_patterns: pytorch* # prefer safetensors model_type: GPTNeoXForCausalLM tokenizer_type: AutoTokenizer load_in_8bit: false -load_in_4bit: true -load_4bit: false +load_in_4bit: false +gptq: false +device_map: auto datasets: - path: vicgalle/alpaca-gpt4 type: alpaca @@ -21,16 +22,16 @@ lora_dropout: 0.0 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific -wandb_project: gpt4all-neox-20b +wandb_project: pythia-12b wandb_watch: wandb_run_id: wandb_log_model: -output_dir: ./gpt4all-neox-20b +output_dir: ./pythia-12b gradient_accumulation_steps: 1 -micro_batch_size: 2 +micro_batch_size: 1 num_epochs: 5 learning_rate: 0.00003 -optimizer: paged_adamw_32bit +optimizer: adamw_bnb_8bit lr_scheduler: cosine train_on_inputs: false group_by_length: false @@ -43,3 +44,6 @@ early_stopping_patience: resume_from_checkpoint: local_rank: gradient_checkpointing: true +fsdp: +fsdp_transformer_layer_cls_to_wrap: +collator_pad_to_longest: true From 71a43f8479a1cef0247ceb2cc00c7c1a048ed863 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 28 May 2023 08:56:08 -0400 Subject: [PATCH 04/55] add validation/warning for bettertransformers and torch version --- src/axolotl/utils/validation.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index ba5feafe8..db19900cc 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -1,7 +1,7 @@ """Module for validating 
config files""" import logging - +import torch def validate_config(cfg): if cfg.gradient_accumulation_steps and cfg.batch_size: @@ -63,7 +63,10 @@ def validate_config(cfg): if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") if cfg.float16 is not True: - logging.warning("You should probably set float16 to true") + logging.warning("You should probably set float16 to true to load the model in float16 for BetterTransformers") + if torch.__version__.split(".")[0] < 2: + logging.warning("torch>=2.0.0 required") + raise ValueError(f"flash_optimum for BetterTransformers may not be used with {torch.__version__}") # TODO # MPT 7b From 488a67d75a4a6ccf7ed0862bbe913a356a473b0d Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 31 May 2023 16:51:19 -0400 Subject: [PATCH 05/55] experimental expansion of ctx len --- scripts/finetune.py | 44 +++++++++++++++++++++++---------------- src/axolotl/utils/data.py | 32 +++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 19 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 99236b087..88815dfdd 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -6,22 +6,20 @@ import os import random import signal import sys -from functools import partial from pathlib import Path from typing import Any, Dict, List, Optional, Union import fire import torch import yaml -from transformers import GenerationConfig, TextStreamer - -from axolotl.utils.data import load_prepare_datasets -from axolotl.utils.dict import DictDefault -from axolotl.utils.models import load_model, load_tokenizer # add src to the pythonpath so we don't need to pip install this from optimum.bettertransformer import BetterTransformer +from transformers import GenerationConfig, TextStreamer +from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_model, load_tokenizer from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.trainer import setup_trainer from axolotl.utils.validation import validate_config @@ -204,9 +202,19 @@ def train( if check_not_in( ["inference", "shard", "merge_lora"], kwargs ): # don't need to load dataset for these - train_dataset, eval_dataset = load_prepare_datasets( - tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH - ) + if not cfg.pretraining_dataset: + train_dataset, eval_dataset = load_prepare_datasets( + tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH + ) + else: + if cfg.pretraining_dataset is True: + pretraining_dataset = "togethercomputer/RedPajama-Data-1T" + else: + pretraining_dataset = cfg.pretraining_dataset + train_dataset = load_pretraining_dataset( + pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + ) + eval_dataset = None if cfg.debug or "debug" in kwargs: logging.info("check_dataset_labels...") @@ -256,7 +264,7 @@ def train( logging.info("check_dataset_labels...") check_dataset_labels( train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for i in range(5)] + [random.randrange(0, len(train_dataset) - 1) for i in range(5)] # nosec ), tokenizer, ) @@ -265,10 +273,7 @@ def train( logging.info("Finished preparing dataset. 
Exiting...") return - try: - model.train() - except: - pass + model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) @@ -285,14 +290,15 @@ def train( # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model if cfg.local_rank == 0: - def terminate_handler(signum, frame, model): + + def terminate_handler(_, __, model): if cfg.flash_optimum: model = BetterTransformer.reverse(model) model.save_pretrained(cfg.output_dir) sys.exit(0) + signal.signal( - signal.SIGINT, - lambda signum, frame: terminate_handler(signum, frame, model) + signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model) ) logging.info("Starting trainer...") @@ -316,7 +322,9 @@ def train( if not Path(cfg.output_dir).is_dir(): os.makedirs(cfg.output_dir, exist_ok=True) if cfg.flash_optimum: - with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True): + with torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=True, enable_mem_efficient=True + ): trainer.train(resume_from_checkpoint=resume_from_checkpoint) else: trainer.train(resume_from_checkpoint=resume_from_checkpoint) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index cba964076..49314372a 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -5,7 +5,8 @@ from hashlib import md5 from pathlib import Path from typing import List, Tuple, Union -from datasets import Dataset, DatasetDict, load_dataset, load_from_disk +import torch +from datasets import Dataset, DatasetDict, IterableDataset, load_dataset, load_from_disk from huggingface_hub import hf_hub_download from transformers import PreTrainedTokenizerBase @@ -392,3 +393,32 @@ def load_prepare_datasets( eval_dataset = dataset["test"] return train_dataset, eval_dataset + + +class PretrainingDatasetWrapper(IterableDataset): + """ + Wrapper for pretraining dataset that avoids loading the dataset into memory + """ + + def __init__(self, tokenizer, dataset_path, max_tokens=2048): + self.tokenizer = tokenizer + self.dataset_path = dataset_path + self.max_tokens = max_tokens + + def __iter__(self): + buffer = [] + for sample in load_dataset( + self.dataset_path, + name="all", + split="train", + streaming=True, + ).shuffle(buffer_size=10000): + buffer += self.tokenizer(sample["text"])["input_ids"] + buffer += [self.tokenizer.eos_token_id] + while len(buffer) > self.max_tokens: + yield torch.tensor(buffer[: self.max_tokens]) + buffer = buffer[self.max_tokens :] + + +def load_pretraining_dataset(path, tokenizer, max_tokens=2048): + return PretrainingDatasetWrapper(tokenizer, path, max_tokens=max_tokens) From 1210dc8fd5c494face7165338f1ed9f2981a2245 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 31 May 2023 21:59:15 -0400 Subject: [PATCH 06/55] more tweaks to do pre-training with bettertransformers --- scripts/finetune.py | 2 ++ src/axolotl/utils/callbacks.py | 24 ++++++++++++++++++++++++ src/axolotl/utils/data.py | 12 +++++++----- src/axolotl/utils/models.py | 4 ++-- src/axolotl/utils/trainer.py | 8 +++++++- src/axolotl/utils/validation.py | 16 ++++++++++++---- 6 files changed, 54 insertions(+), 12 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 88815dfdd..9bed61ca4 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -14,6 +14,7 @@ import torch import yaml # add src to the pythonpath so we don't need to pip install this +from datasets import Dataset from optimum.bettertransformer import BetterTransformer from 
transformers import GenerationConfig, TextStreamer @@ -214,6 +215,7 @@ def train( train_dataset = load_pretraining_dataset( pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len ) + train_dataset = Dataset.from_list(list(train_dataset)) eval_dataset = None if cfg.debug or "debug" in kwargs: diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index f6852249a..ab197304c 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -2,6 +2,7 @@ import os +from optimum.bettertransformer import BetterTransformer from transformers import ( TrainerCallback, TrainerControl, @@ -30,3 +31,26 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public- kwargs["model"].save_pretrained(peft_model_path) return control + + +class SaveBetterTransformerModelCallback( + TrainerCallback +): # pylint: disable=too-few-public-methods + """Callback to save the BatterTransformer wrapped model""" + + def on_save( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + checkpoint_folder = os.path.join( + args.output_dir, + f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", + ) + + model = BetterTransformer.reverse(kwargs["model"]) + model.save_pretrained(checkpoint_folder) + + return control diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 49314372a..164296ee2 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -409,14 +409,16 @@ class PretrainingDatasetWrapper(IterableDataset): buffer = [] for sample in load_dataset( self.dataset_path, - name="all", - split="train", - streaming=True, - ).shuffle(buffer_size=10000): + )["train"].shuffle(): buffer += self.tokenizer(sample["text"])["input_ids"] buffer += [self.tokenizer.eos_token_id] while len(buffer) > self.max_tokens: - yield torch.tensor(buffer[: self.max_tokens]) + input_ids = torch.tensor(buffer[: self.max_tokens]) + yield { + "input_ids": input_ids, + "attention_mask": torch.ones(input_ids.size()), + "labels": input_ids, + } buffer = buffer[self.max_tokens :] diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 11b4629ec..91ef96ca9 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -10,8 +10,8 @@ from typing import TYPE_CHECKING, Optional, Tuple # noqa: F401 import bitsandbytes as bnb import torch import transformers -from transformers import PreTrainedModel # noqa: F401 from optimum.bettertransformer import BetterTransformer +from transformers import PreTrainedModel # noqa: F401 from transformers import ( AutoConfig, AutoModelForCausalLM, @@ -136,7 +136,7 @@ def load_model( logging.info("patching with xpos rope") replace_llama_rope_with_xpos_rope() - if cfg.bf16: + if cfg.bf16 or cfg.bfloat16: torch_dtype = torch.bfloat16 elif cfg.load_in_8bit or cfg.fp16 or cfg.float16: torch_dtype = torch.float16 diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 9ae1e7e93..b7823fea4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -16,7 +16,10 @@ from torch.optim.lr_scheduler import OneCycleLR from transformers import EarlyStoppingCallback, Trainer from transformers.trainer_pt_utils import get_parameter_names -from axolotl.utils.callbacks import SavePeftModelCallback +from axolotl.utils.callbacks import ( + SaveBetterTransformerModelCallback, + SavePeftModelCallback, +) from axolotl.utils.schedulers import InterpolatingLogScheduler @@ -228,6 +231,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, 
model, tokenizer): ]: # only save in rank 0 callbacks.append(SavePeftModelCallback) + if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: + callbacks.append(SaveBetterTransformerModelCallback) + data_collator_kwargs = { "padding": True, } diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index db19900cc..abaaba8d0 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -1,8 +1,10 @@ """Module for validating config files""" import logging + import torch + def validate_config(cfg): if cfg.gradient_accumulation_steps and cfg.batch_size: raise ValueError( @@ -59,14 +61,20 @@ def validate_config(cfg): if cfg.flash_optimum is True: if cfg.adapter: - logging.warning("BetterTransformers probably doesn't work with PEFT adapters") + logging.warning( + "BetterTransformers probably doesn't work with PEFT adapters" + ) if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") if cfg.float16 is not True: - logging.warning("You should probably set float16 to true to load the model in float16 for BetterTransformers") - if torch.__version__.split(".")[0] < 2: + logging.warning( + "You should probably set float16 to true to load the model in float16 for BetterTransformers" + ) + if int(torch.__version__.split(".")[0]) < 2: logging.warning("torch>=2.0.0 required") - raise ValueError(f"flash_optimum for BetterTransformers may not be used with {torch.__version__}") + raise ValueError( + f"flash_optimum for BetterTransformers may not be used with {torch.__version__}" + ) # TODO # MPT 7b From 1a82082e91127fedae540cfbc9e68ce2b3ef08a4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 1 Jun 2023 00:33:13 -0400 Subject: [PATCH 07/55] fix bettertransformers save, force it to skip after saving correctly in callback --- src/axolotl/utils/callbacks.py | 30 +++++++++++++++++++++--------- src/axolotl/utils/trainer.py | 1 + src/axolotl/utils/validation.py | 5 +++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index ab197304c..64bf48664 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -9,7 +9,7 @@ from transformers import ( TrainerState, TrainingArguments, ) -from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR +from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods @@ -36,21 +36,33 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public- class SaveBetterTransformerModelCallback( TrainerCallback ): # pylint: disable=too-few-public-methods - """Callback to save the BatterTransformer wrapped model""" + """Callback to save the BetterTransformer wrapped model""" - def on_save( + def on_step_end( self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs, ): - checkpoint_folder = os.path.join( - args.output_dir, - f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", - ) + # Save + if ( + args.save_strategy == IntervalStrategy.STEPS + and args.save_steps > 0 + and state.global_step % args.save_steps == 0 + ): + control.should_save = True - model = BetterTransformer.reverse(kwargs["model"]) - model.save_pretrained(checkpoint_folder) + if control.should_save: + checkpoint_folder = os.path.join( + args.output_dir, + f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", + ) + model = BetterTransformer.reverse(kwargs["model"]) + 
model.save_pretrained(checkpoint_folder) + + # since we're saving here, we don't need the trainer loop to attempt to save too b/c + # the trainer will raise an exception since it can't save a BetterTransformer wrapped model + control.should_save = False return control diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index b7823fea4..59b1dc803 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -232,6 +232,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): callbacks.append(SavePeftModelCallback) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: + logging.info("Setting up SaveBetterTransformerModelCallback.") callbacks.append(SaveBetterTransformerModelCallback) data_collator_kwargs = { diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index abaaba8d0..396036621 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -66,9 +66,10 @@ def validate_config(cfg): ) if cfg.fp16 or cfg.bf16: raise ValueError("AMP is not supported with BetterTransformer") - if cfg.float16 is not True: + if cfg.float16 is not True and cfg.bloat16 is not True: logging.warning( - "You should probably set float16 to true to load the model in float16 for BetterTransformers" + "You should probably set bfloat16 or float16 to true to " + "load the model in float16 for BetterTransformers" ) if int(torch.__version__.split(".")[0]) < 2: logging.warning("torch>=2.0.0 required") From ab5cd28acfd12304201c4c184aa03a5ac3885ce2 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 1 Jun 2023 08:20:08 -0400 Subject: [PATCH 08/55] more gpt-neox long ctx fixes --- src/axolotl/utils/callbacks.py | 1 + src/axolotl/utils/data.py | 10 +++++++--- src/axolotl/utils/models.py | 6 ++++++ src/axolotl/utils/validation.py | 9 ++++++++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/axolotl/utils/callbacks.py b/src/axolotl/utils/callbacks.py index 64bf48664..526121f2e 100644 --- a/src/axolotl/utils/callbacks.py +++ b/src/axolotl/utils/callbacks.py @@ -61,6 +61,7 @@ class SaveBetterTransformerModelCallback( model = BetterTransformer.reverse(kwargs["model"]) model.save_pretrained(checkpoint_folder) + # FIXME - need to cleanup old checkpoints # since we're saving here, we don't need the trainer loop to attempt to save too b/c # the trainer will raise an exception since it can't save a BetterTransformer wrapped model diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 164296ee2..13ad7c75d 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -388,9 +388,13 @@ def load_prepare_datasets( index=cfg.dataset_shard_idx, ) - dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False) - train_dataset = dataset["train"] - eval_dataset = dataset["test"] + if cfg.val_set_size: + dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False) + train_dataset = dataset["train"] + eval_dataset = dataset["test"] + else: + train_dataset = dataset + eval_dataset = None return train_dataset, eval_dataset diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 91ef96ca9..49a9b6f85 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -300,6 +300,12 @@ def load_model( embeddings_len = math.ceil(len(tokenizer) / 32) * 32 model.resize_token_embeddings(embeddings_len) + if cfg.sequence_len >= model.config.max_position_embeddings: + logging.warning( + f"increasing 
model.config.max_position_embeddings to {cfg.sequence_len}" + ) + model.config.max_position_embeddings = cfg.sequence_len + if not cfg.gptq and ( (cfg.adapter == "lora" and load_in_8bit) or (cfg.adapter == "qlora" and cfg.load_in_4bit) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 396036621..2e2450fba 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -80,4 +80,11 @@ def validate_config(cfg): # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 - # no 8bit adamw w bf16 + # no 8bit adaAmw w bf16 + + # GPT-NeoX + # evals broken when extending context len + # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/transformers/models/gpt_neox/modeling_gpt_neox.py", line 162, in forward attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + # File "/root/miniconda3/envs/py3.9/lib/python3.9/site-packages/optimum/bettertransformer/models/attention.py", line 74, in gpt2_wrapped_scaled_dot_product + # attention_mask = causal_mask + attention_mask + # RuntimeError: The size of tensor a (2048) must match the size of tensor b (8132) at non-singleton dimension 3 From 1db46a9c720d60113ff2828ab6de219e1b857c79 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 8 Jun 2023 22:05:06 -0400 Subject: [PATCH 09/55] linting fix --- examples/pythia-12b/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md index 0953caa4e..d28d5e77d 100644 --- a/examples/pythia-12b/README.md +++ b/examples/pythia-12b/README.md @@ -7,4 +7,3 @@ python scripts/finetune.py examples/pythia-12b/config.yml ``` ⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️ - From eea2731a5ebc113e769aa2a57af9b96effed2053 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 9 Jun 2023 20:25:38 -0400 Subject: [PATCH 10/55] add streaming dataset support for pretraining datasets --- README.md | 2 + scripts/finetune.py | 23 +----- src/axolotl/utils/data.py | 136 ++++++++++++++++++++++++++------ src/axolotl/utils/validation.py | 5 ++ tests/test_validation.py | 51 ++++++++++++ 5 files changed, 171 insertions(+), 46 deletions(-) diff --git a/README.md b/README.md index de929f237..2bc55732d 100644 --- a/README.md +++ b/README.md @@ -410,6 +410,8 @@ optimizer: # specify weight decay weight_decay: +# whether to bettertransformers +flash_optimum: # whether to use xformers attention patch https://github.com/facebookresearch/xformers: xformers_attention: # whether to use flash attention patch https://github.com/HazyResearch/flash-attention: diff --git a/scripts/finetune.py b/scripts/finetune.py index 9bed61ca4..ab226f68f 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -14,7 +14,6 @@ import torch import yaml # add src to the pythonpath so we don't need to pip install this -from datasets import Dataset from optimum.bettertransformer import BetterTransformer from transformers import GenerationConfig, TextStreamer @@ -208,14 +207,11 @@ def train( tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH ) else: - if cfg.pretraining_dataset is True: - pretraining_dataset = "togethercomputer/RedPajama-Data-1T" - else: - pretraining_dataset = cfg.pretraining_dataset train_dataset = load_pretraining_dataset( - pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + cfg.pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len ) - train_dataset = Dataset.from_list(list(train_dataset)) + # 
https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 + train_dataset = train_dataset.with_format("torch") eval_dataset = None if cfg.debug or "debug" in kwargs: @@ -262,19 +258,6 @@ def train( model.save_pretrained(cfg.output_dir) return - if cfg.debug: - logging.info("check_dataset_labels...") - check_dataset_labels( - train_dataset.select( - [random.randrange(0, len(train_dataset) - 1) for i in range(5)] # nosec - ), - tokenizer, - ) - - if prepare_ds_only: - logging.info("Finished preparing dataset. Exiting...") - return - model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 13ad7c75d..492d8059b 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -1,12 +1,12 @@ """Module containing data utilities""" - +import functools import logging from hashlib import md5 from pathlib import Path from typing import List, Tuple, Union import torch -from datasets import Dataset, DatasetDict, IterableDataset, load_dataset, load_from_disk +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk from huggingface_hub import hf_hub_download from transformers import PreTrainedTokenizerBase @@ -399,32 +399,116 @@ def load_prepare_datasets( return train_dataset, eval_dataset -class PretrainingDatasetWrapper(IterableDataset): - """ - Wrapper for pretraining dataset that avoids loading the dataset into memory - """ +def encode_pretraining(tokenizer, max_tokens, examples): + res = tokenizer( + examples["text"], + truncation=True, + max_length=max_tokens - 2, + add_special_tokens=True, + ) + # Convert to PyTorch tensors + input_ids = [torch.tensor(seq) for seq in res["input_ids"]] + attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]] + new_input_ids = [] + new_attention_mask = [] + # Append EOS and PAD tokens to input_ids, and correct attention_mask + for i, _ in enumerate(input_ids): + input_ids[i] = torch.cat( + ( + input_ids[i], + torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]), + ), + dim=0, + ) + attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0) - def __init__(self, tokenizer, dataset_path, max_tokens=2048): - self.tokenizer = tokenizer - self.dataset_path = dataset_path - self.max_tokens = max_tokens + # Concatenate tokens so that their lengths are less than max_tokens + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) - def __iter__(self): - buffer = [] - for sample in load_dataset( - self.dataset_path, - )["train"].shuffle(): - buffer += self.tokenizer(sample["text"])["input_ids"] - buffer += [self.tokenizer.eos_token_id] - while len(buffer) > self.max_tokens: - input_ids = torch.tensor(buffer[: self.max_tokens]) - yield { - "input_ids": input_ids, - "attention_mask": torch.ones(input_ids.size()), - "labels": input_ids, - } - buffer = buffer[self.max_tokens :] + for ids, mask in zip(input_ids, attention_mask): + if buffer_input_ids.numel() == max_tokens: + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + elif buffer_input_ids.numel() + ids.numel() <= 
max_tokens: + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + else: + buffer_input_ids = torch.cat( + ( + buffer_input_ids, + torch.full( + (max_tokens - buffer_input_ids.numel(),), + tokenizer.pad_token_id, + dtype=torch.long, + ), + ), + dim=0, + ) + buffer_attention_mask = torch.cat( + ( + buffer_attention_mask, + torch.full( + (max_tokens - buffer_attention_mask.numel(),), + 0, + dtype=torch.long, + ), + ), + dim=0, + ) + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + buffer_input_ids = torch.tensor([], dtype=torch.long) + buffer_attention_mask = torch.tensor([], dtype=torch.long) + + buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0) + buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0) + + if buffer_input_ids.numel() > 0: # for any leftover tokens + while buffer_input_ids.numel() < max_tokens: # make all sequences equal in size + buffer_input_ids = torch.cat( + ( + buffer_input_ids, + torch.full( + (max_tokens - buffer_input_ids.numel(),), + tokenizer.pad_token_id, + dtype=torch.long, + ), + ), + dim=0, + ) + buffer_attention_mask = torch.cat( + ( + buffer_attention_mask, + torch.full( + (max_tokens - buffer_attention_mask.numel(),), + 0, + dtype=torch.long, + ), + ), + dim=0, + ) + new_input_ids.append(buffer_input_ids) + new_attention_mask.append(buffer_attention_mask) + + ret = { + "input_ids": [seq.tolist() for seq in new_input_ids], + "labels": [seq.tolist() for seq in new_input_ids], + "attention_mask": [seq.tolist() for seq in new_attention_mask], + } + + logging.debug(len(ret["input_ids"])) + return ret def load_pretraining_dataset(path, tokenizer, max_tokens=2048): - return PretrainingDatasetWrapper(tokenizer, path, max_tokens=max_tokens) + encode = functools.partial(encode_pretraining, tokenizer, max_tokens) + dataset = load_dataset(path, streaming=True, split="train") + dataset = dataset.shuffle(seed=42, buffer_size=10_000) + # TODO dynamically figure out which columns/features to remove + dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"]) + return dataset diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 2e2450fba..603afbfee 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -77,6 +77,11 @@ def validate_config(cfg): f"flash_optimum for BetterTransformers may not be used with {torch.__version__}" ) + if cfg.pretraining_dataset and cfg.group_by_length: + logging.warning( + "You probably want to disable group_by_length as it will force a streamed dataset to download completely." 
+ ) + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 diff --git a/tests/test_validation.py b/tests/test_validation.py index 50bdf37e6..575392ab4 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -198,3 +198,54 @@ class ValidationTest(unittest.TestCase): ) validate_config(cfg) + + def test_flash_optimum(self): + cfg = DictDefault( + { + "flash_optimum": True, + "adapter": "lora", + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "BetterTransformers probably doesn't work with PEFT adapters" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "flash_optimum": True, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "probably set bfloat16 or float16" in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "flash_optimum": True, + "fp16": True, + } + ) + regex_exp = r".*AMP is not supported.*" + + with pytest.raises(ValueError, match=regex_exp): + validate_config(cfg) + + cfg = DictDefault( + { + "flash_optimum": True, + "bf16": True, + } + ) + regex_exp = r".*AMP is not supported.*" + + with pytest.raises(ValueError, match=regex_exp): + validate_config(cfg) From 0c6f928601ac289f7d4b513855feab5047cd7a5a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 14:21:43 -0400 Subject: [PATCH 11/55] address PR feedback --- examples/pythia-12b/README.md | 2 +- examples/pythia-12b/config.yml | 4 ++-- scripts/finetune.py | 5 ++++- src/axolotl/utils/data.py | 4 ++-- src/axolotl/utils/trainer.py | 2 -- 5 files changed, 9 insertions(+), 8 deletions(-) diff --git a/examples/pythia-12b/README.md b/examples/pythia-12b/README.md index d28d5e77d..123ffa710 100644 --- a/examples/pythia-12b/README.md +++ b/examples/pythia-12b/README.md @@ -1,4 +1,4 @@ -# Python 12B +# Pythia 12B - Single-GPU A100 only (?) 
diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml index 28e822c77..3b3d91630 100644 --- a/examples/pythia-12b/config.yml +++ b/examples/pythia-12b/config.yml @@ -22,7 +22,7 @@ lora_dropout: 0.0 lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific -wandb_project: pythia-12b +wandb_project: wandb_watch: wandb_run_id: wandb_log_model: @@ -45,5 +45,5 @@ resume_from_checkpoint: local_rank: gradient_checkpointing: true fsdp: -fsdp_transformer_layer_cls_to_wrap: +fsdp_config: collator_pad_to_longest: true diff --git a/scripts/finetune.py b/scripts/finetune.py index ab226f68f..47aada411 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -208,7 +208,10 @@ def train( ) else: train_dataset = load_pretraining_dataset( - cfg.pretraining_dataset, tokenizer, max_tokens=cfg.sequence_len + cfg.pretraining_dataset, + tokenizer, + max_tokens=cfg.sequence_len, + seed=cfg.seed, ) # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230 train_dataset = train_dataset.with_format("torch") diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index 492d8059b..058c24bcd 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -505,10 +505,10 @@ def encode_pretraining(tokenizer, max_tokens, examples): return ret -def load_pretraining_dataset(path, tokenizer, max_tokens=2048): +def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42): encode = functools.partial(encode_pretraining, tokenizer, max_tokens) dataset = load_dataset(path, streaming=True, split="train") - dataset = dataset.shuffle(seed=42, buffer_size=10_000) + dataset = dataset.shuffle(seed=seed, buffer_size=10_000) # TODO dynamically figure out which columns/features to remove dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"]) return dataset diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 59b1dc803..57a08aa53 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,7 +1,6 @@ """Module containing the Trainer class and related functions""" import importlib -import logging import math import os import sys @@ -232,7 +231,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): callbacks.append(SavePeftModelCallback) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: - logging.info("Setting up SaveBetterTransformerModelCallback.") callbacks.append(SaveBetterTransformerModelCallback) data_collator_kwargs = { From 759e8673ce497125da5855a173fd80f57bb071b3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 14:25:21 -0400 Subject: [PATCH 12/55] Update scripts/finetune.py Co-authored-by: NanoCode012 --- scripts/finetune.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 47aada411..cd9234334 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -261,7 +261,6 @@ def train( model.save_pretrained(cfg.output_dir) return - model.train() trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) From 958da703762b7759eabdaa6fd7fad231228e1ad9 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 10 Jun 2023 15:28:08 -0400 Subject: [PATCH 13/55] fix formatting --- scripts/finetune.py | 1 - src/axolotl/utils/trainer.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 
cd9234334..2f6bef3ef 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -261,7 +261,6 @@ def train( model.save_pretrained(cfg.output_dir) return - trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) model.config.use_cache = False diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 57a08aa53..b7823fea4 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -1,6 +1,7 @@ """Module containing the Trainer class and related functions""" import importlib +import logging import math import os import sys From c9a149f9e8bacdcd59a9e6de435499b2f4a845c1 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 11 Jun 2023 10:11:17 -0400 Subject: [PATCH 14/55] add check for attr --- src/axolotl/utils/models.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 49a9b6f85..532fa5518 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -300,7 +300,10 @@ def load_model( embeddings_len = math.ceil(len(tokenizer) / 32) * 32 model.resize_token_embeddings(embeddings_len) - if cfg.sequence_len >= model.config.max_position_embeddings: + if ( + hasattr(model.config, "max_position_embeddings") + and cfg.sequence_len >= model.config.max_position_embeddings + ): logging.warning( f"increasing model.config.max_position_embeddings to {cfg.sequence_len}" ) From 2ba4ae8f461c0c491f9ca303c134f9ad6f725e8c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 12 Jun 2023 10:07:18 -0400 Subject: [PATCH 15/55] tweak config to work --- examples/openllama-3b/config.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index 6fd704ffc..4372876eb 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -26,17 +26,18 @@ wandb_watch: wandb_run_id: wandb_log_model: output_dir: ./openllama-out -batch_size: 16 -micro_batch_size: 4 +gradient_accumulation_steps: 1 +micro_batch_size: 1 num_epochs: 3 optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine -learning_rate: 0.0002 +learning_rate: 0.00001 train_on_inputs: false group_by_length: false +float16: true bf16: false -fp16: true +fp16: false tf32: false gradient_checkpointing: true early_stopping_patience: @@ -52,7 +53,7 @@ eval_steps: 50 save_steps: debug: deepspeed: -weight_decay: 0.0 +weight_decay: 0.1 fsdp: fsdp_config: special_tokens: From 34ae69989f1ce1cf4fdf53f0b55c537927dc4b9a Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Mon, 12 Jun 2023 21:39:19 +0200 Subject: [PATCH 16/55] fix inference --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 214bfd14d..3126d81f3 100644 --- a/README.md +++ b/README.md @@ -500,16 +500,16 @@ Pass the appropriate flag to the train command: - Pretrained LORA: ```bash - --inference --lora_model_dir ./completed-model + --inference --lora_model_dir="./lora-output-dir" ``` - Full weights finetune: ```bash - --inference --base_model ./completed-model + --inference --base_model="./completed-model" ``` - Full weights finetune w/ a prompt from a text file: ```bash cat /tmp/prompt.txt | python scripts/finetune.py configs/your_config.yml \ - --base_model ./completed-model --inference --prompter=None --load_in_8bit=True + --base_model="./completed-model" --inference --prompter=None --load_in_8bit=True ``` ### Merge LORA to base From 4b43a66a0b2e902ecfa49ab932e8df292e5e53dd Mon Sep 
17 00:00:00 2001 From: Wing Lian Date: Mon, 12 Jun 2023 18:38:38 -0400 Subject: [PATCH 17/55] update alpaca_chat prompts for instructions to explainn the conversation --- src/axolotl/prompt_strategies/alpaca_chat.py | 23 +++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 0f8c31d6a..1183c1e8e 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -20,11 +20,24 @@ def load(tokenizer, cfg): class AlpacaConcisePrompter(AlpacaPrompter): """ - Alpaca Prompter extending the system prompt to ask for concise answers + Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers """ - system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that concisely and appropriately completes the request.\n\n" - system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately and concisely completes the request.\n\n" + system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n" + system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n" + + +class AlpacaChatPrompter(AlpacaPrompter): + """ + Alpaca Chat Prompter extending the system prompt to for chat-instruct answers + """ + + system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n" + system_no_input_prompt = "Below is an instruction from a USER that describes a task. 
The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n" + + def __init__(self): # pylint: disable=super-init-not-called + self.prompt_style = PromptStyle.CHAT.value + self.match_prompt_style() class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): @@ -64,7 +77,7 @@ def load_concise(tokenizer, cfg): def load_qa(tokenizer, cfg): return AlpacaQAPromptTokenizingStrategy( - AlpacaPrompter(PromptStyle.CHAT.value), + AlpacaChatPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len, @@ -73,7 +86,7 @@ def load_qa(tokenizer, cfg): def load_camel_ai(tokenizer, cfg): return CamelAIPromptTokenizingStrategy( - AlpacaPrompter(PromptStyle.CHAT.value), + AlpacaChatPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len, From dc77c8ebce8ec4135f4e0c03a9d336b3f0957358 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 13 Jun 2023 12:01:46 +0900 Subject: [PATCH 18/55] chore: Refactor inf_kwargs out --- scripts/finetune.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/finetune.py b/scripts/finetune.py index 283100c8a..785f3cf23 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -63,7 +63,7 @@ def get_multi_line_input() -> Optional[str]: return instruction -def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"): +def do_inference(cfg, model, tokenizer, prompter: Optional[str]): default_tokens = {"unk_token": "", "bos_token": "", "eos_token": ""} for token, symbol in default_tokens.items(): @@ -257,13 +257,13 @@ def train( if cfg.inference: logging.info("calling do_inference function") - inf_kwargs: Dict[str, Any] = {} + prompter: Optional[str] = "AlpacaPrompter" if "prompter" in kwargs: if kwargs["prompter"] == "None": - inf_kwargs["prompter"] = None + prompter = None else: - inf_kwargs["prompter"] = kwargs["prompter"] - do_inference(cfg, model, tokenizer, **inf_kwargs) + prompter = kwargs["prompter"] + do_inference(cfg, model, tokenizer, prompter=prompter) return if "shard" in kwargs: From 5ff547dc703e7dfc09e56baf5fe2749e56076961 Mon Sep 17 00:00:00 2001 From: PocketDoc Labs Date: Mon, 12 Jun 2023 22:38:10 -0700 Subject: [PATCH 19/55] Update README.md to include a community showcase --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 3126d81f3..c4e1887e9 100644 --- a/README.md +++ b/README.md @@ -552,6 +552,16 @@ Building something cool with Axolotl? Consider adding a badge to your model card [Built with Axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) +## Community Showcase + +Open Access AI Collective +- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b) +- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b) +- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat) + +PocketDoc Labs +- [Dan's PersonalityEngine 13b LoRA](https://huggingface.co/PocketDoc/Dans-PersonalityEngine-13b-LoRA) + ## Contributing 🤝 Bugs? Please check for open issue else create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new). 
From 3513885f434a1668754883adc3a050fe658c4d8f Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 14 Jun 2023 01:10:58 +0900 Subject: [PATCH 20/55] Fix sharegpt type --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c4e1887e9..5a00cccac 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,7 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"instruction": "...", "input": "...", "output": "..."} ``` -- `sharegpt`: conversations +- `sharegpt:chat`: conversations ```json {"conversations": [{"from": "...", "value": "..."}]} ``` From 556fe408b3ac9117b825705d5f08982377377dd8 Mon Sep 17 00:00:00 2001 From: "maciej.karasek" <103371156+MaciejKarasek@users.noreply.github.com> Date: Wed, 14 Jun 2023 16:59:57 +0200 Subject: [PATCH 21/55] issue #205 bugfix --- src/axolotl/utils/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 05acfce93..103c707f2 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -252,11 +252,11 @@ def load_model( ) # Shouldn't be a problem most of the time. will obviously error if the model doesn't support this # when training starts - if hasattr(config, "max_seq_len") and cfg.sequence_len > config.max_seq_len: + if hasattr(config, "max_seq_len") and config.max_seq_len and cfg.sequence_len > config.max_seq_len: config.max_seq_len = cfg.sequence_len logging.warning(f"increasing context length to {cfg.sequence_len}") elif ( - hasattr(config, "max_sequence_length") + hasattr(config, "max_sequence_length") and config.max_sequence_length and cfg.sequence_len > config.max_sequence_length ): config.max_sequence_length = cfg.sequence_len @@ -289,7 +289,7 @@ def load_model( model.resize_token_embeddings(embeddings_len) if ( - hasattr(model.config, "max_position_embeddings") + hasattr(model.config, "max_position_embeddings") and model.config.max_position_embeddings and cfg.sequence_len >= model.config.max_position_embeddings ): logging.warning( From 136522f9c9bbb4658f9ebaa5f528366d9c15b2ae Mon Sep 17 00:00:00 2001 From: "maciej.karasek" <103371156+MaciejKarasek@users.noreply.github.com> Date: Wed, 14 Jun 2023 20:02:09 +0200 Subject: [PATCH 22/55] style correction --- src/axolotl/utils/models.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 103c707f2..c6d380267 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -252,11 +252,16 @@ def load_model( ) # Shouldn't be a problem most of the time. 
will obviously error if the model doesn't support this # when training starts - if hasattr(config, "max_seq_len") and config.max_seq_len and cfg.sequence_len > config.max_seq_len: + if ( + hasattr(config, "max_seq_len") + and config.max_seq_len + and cfg.sequence_len > config.max_seq_len + ): config.max_seq_len = cfg.sequence_len logging.warning(f"increasing context length to {cfg.sequence_len}") elif ( - hasattr(config, "max_sequence_length") and config.max_sequence_length + hasattr(config, "max_sequence_length") + and config.max_sequence_length and cfg.sequence_len > config.max_sequence_length ): config.max_sequence_length = cfg.sequence_len @@ -289,7 +294,8 @@ def load_model( model.resize_token_embeddings(embeddings_len) if ( - hasattr(model.config, "max_position_embeddings") and model.config.max_position_embeddings + hasattr(model.config, "max_position_embeddings") + and model.config.max_position_embeddings and cfg.sequence_len >= model.config.max_position_embeddings ): logging.warning( From 945c4191a33753fee06d04b7ab3005df91b0feaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Steffen=20R=C3=B6cker?= Date: Wed, 14 Jun 2023 20:09:26 +0200 Subject: [PATCH 23/55] Use AutoTokenizer for redpajama example --- examples/redpajama/config-3b.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/redpajama/config-3b.yml b/examples/redpajama/config-3b.yml index e7342b2f7..869c0883e 100644 --- a/examples/redpajama/config-3b.yml +++ b/examples/redpajama/config-3b.yml @@ -1,7 +1,7 @@ base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1 base_model_config: togethercomputer/RedPajama-INCITE-Chat-3B-v1 model_type: GPTNeoXForCausalLM -tokenizer_type: GPTNeoXTokenizer +tokenizer_type: AutoTokenizer trust_remote_code: load_in_8bit: false datasets: From 7925ddce866daa03b6df9b044b1b8f4222fd5edf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 01:59:33 -0400 Subject: [PATCH 24/55] bugfix for potential off by one --- src/axolotl/prompt_strategies/alpaca_chat.py | 12 +++++ src/axolotl/prompt_tokenizers.py | 32 +++++++------- tests/test_prompt_tokenizers.py | 46 ++++++++++++++++++-- 3 files changed, 72 insertions(+), 18 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 1183c1e8e..6161d7e37 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -40,6 +40,18 @@ class AlpacaChatPrompter(AlpacaPrompter): self.match_prompt_style() +class NoSystemPrompter(AlpacaPrompter): + """ + Null Prompter with no system prompts + """ + + prompt_input = "{instruction} {input} " + prompt_no_input = "{instruction} " + + def __init__(self): # pylint: disable=super-init-not-called + pass + + class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): """ Tokenizing strategy for AlpacaQA diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 8b3c88fee..6408620d7 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -96,25 +96,27 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy): input, # pylint: disable=redefined-builtin response, ) = self.parse_instruction_fields(prompt) - full_prompt = self._build_full_prompt(instruction, input, response) - tokenized_full_prompt = self._tokenize(full_prompt) - if not self.train_on_inputs: - user_prompt = next( - iter( - self.prompter.build_prompt( - instruction, - input, - ) + user_prompt = next( + iter( + self.prompter.build_prompt( 
+ instruction, + input, ) ) - tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False) - user_prompt_len = len(tokenized_user_prompt["input_ids"]) + ) + tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False) + if not self.train_on_inputs: + user_prompt_len = len(tokenized_prompt["input_ids"]) # TODO this could be sped up using numpy array slicing - tokenized_full_prompt["labels"] = [ - -100 - ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:] + tokenized_prompt["labels"] = [-100] * user_prompt_len + tokenized_res_prompt = self._tokenize( + response, strip_bos_token=True, add_eos_token=True + ) + tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"] + tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"] + tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"] - return tokenized_full_prompt + return tokenized_prompt def _build_full_prompt( self, instruction, input, response # pylint: disable=redefined-builtin diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 89209e84f..abc746bbf 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -6,8 +6,12 @@ from pathlib import Path from transformers import AutoTokenizer -from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy -from axolotl.prompters import ShareGPTPrompter +from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter +from axolotl.prompt_tokenizers import ( + AlpacaPromptTokenizingStrategy, + ShareGPTPromptTokenizingStrategy, +) +from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter logging.basicConfig(level="INFO") @@ -29,7 +33,6 @@ class TestPromptTokenizationStrategies(unittest.TestCase): ) def test_sharegpt_integration(self): - print(Path(__file__).parent) with open( Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8" ) as fin: @@ -53,6 +56,43 @@ class TestPromptTokenizationStrategies(unittest.TestCase): self.assertEqual(len(example[fields]), len(tokenized_conversation[fields])) self.assertEqual(example[fields], tokenized_conversation[fields]) + def test_completion(self): + """ + tests the interface between the user and assistant parts + """ + prompter = NoSystemPrompter() + strat = AlpacaPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = { + "instruction": "hello cruel. lorem ipsum dolor sit amet.", + "output": "world!", + } + example = strat.tokenize_prompt(sample) + world_idx = example["input_ids"].index(3186) + assert example["labels"][world_idx] == 3186 + assert example["labels"][world_idx - 1] == -100 + + def test_alpaca(self): + """ + tests the interface between the user and assistant parts + """ + prompter = AlpacaPrompter() + strat = AlpacaPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = {"instruction": "hello!", "output": "Hi! 
How can I help?"} + example = strat.tokenize_prompt(sample) + world_idx = example["input_ids"].index(6324) + assert example["labels"][world_idx] == 6324 + assert example["labels"][world_idx - 1] == -100 + if __name__ == "__main__": unittest.main() From baed440fa16552ea32bebfea30c389fcadda6d33 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:03:53 -0400 Subject: [PATCH 25/55] ingore duplicate code in tests --- tests/test_prompt_tokenizers.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index abc746bbf..8d9635c0e 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -61,6 +61,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): tests the interface between the user and assistant parts """ prompter = NoSystemPrompter() + # pylint: disable=duplicate-code strat = AlpacaPromptTokenizingStrategy( prompter, self.tokenizer, @@ -80,6 +81,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): """ tests the interface between the user and assistant parts """ + # pylint: disable=duplicate-code prompter = AlpacaPrompter() strat = AlpacaPromptTokenizingStrategy( prompter, From 88e17ffc500173d8b6baae50195409edfc9a10ea Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 00:26:44 -0400 Subject: [PATCH 26/55] add float16 docs and tweak typehints --- README.md | 8 ++++++++ src/axolotl/utils/models.py | 8 +++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e267a9d6d..225ef0dd7 100644 --- a/README.md +++ b/README.md @@ -264,6 +264,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic bf16: true # require >=ampere fp16: true tf32: true # require >=ampere + bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP + float16: true # use instead of fp16 when you don't want AMP ``` Note: Repo does not do 4-bit quantization. @@ -522,6 +524,12 @@ Add below flag to train command above --merge_lora --lora_model_dir="./completed-model" --load_in_8bit=False --load_in_4bit=False ``` +If you run out of CUDA memory, you can try to merge in system RAM with + +```bash +CUDA_VISIBLE_DEVICES="" python3 scripts/finetune.py ... +``` + ## Common Errors 🧰 > Cuda out of memory diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index c6d380267..2ae9a26aa 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -11,13 +11,14 @@ import bitsandbytes as bnb import torch import transformers from optimum.bettertransformer import BetterTransformer -from transformers import PreTrainedModel # noqa: F401 -from transformers import ( +from transformers import ( # noqa: F401 AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, LlamaConfig, + PreTrainedModel, + PreTrainedTokenizerBase, ) from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN @@ -71,7 +72,7 @@ def load_tokenizer( def load_model( base_model, base_model_config, model_type, tokenizer, cfg, adapter="lora" ): - # type: (str, str, str, AutoTokenizer, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]] + # type: (str, str, str, PreTrainedTokenizerBase, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]] """ Load a model from a base model and a model type. 
""" @@ -284,6 +285,7 @@ def load_model( model = AutoModelForCausalLM.from_pretrained( base_model, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, + load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, torch_dtype=torch_dtype, device_map=cfg.device_map, trust_remote_code=cfg.trust_remote_code or False, From d7635b71486c65629f2ec1e4fe8c70396366aa96 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:06:27 -0400 Subject: [PATCH 27/55] hint to what AMP means --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 225ef0dd7..d6c9cfefb 100644 --- a/README.md +++ b/README.md @@ -264,7 +264,7 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic bf16: true # require >=ampere fp16: true tf32: true # require >=ampere - bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP + bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision) float16: true # use instead of fp16 when you don't want AMP ``` Note: Repo does not do 4-bit quantization. From 1ab3bf3e6772be2165a8504430c61d0d1b55e32f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 02:09:33 -0400 Subject: [PATCH 28/55] fix test name --- tests/test_prompt_tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 8d9635c0e..aba340eee 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -56,7 +56,7 @@ class TestPromptTokenizationStrategies(unittest.TestCase): self.assertEqual(len(example[fields]), len(tokenized_conversation[fields])) self.assertEqual(example[fields], tokenized_conversation[fields]) - def test_completion(self): + def test_no_sys_prompt(self): """ tests the interface between the user and assistant parts """ From 6d0ee4ba34fbf20e9846ce24875448019f8dba65 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 08:40:41 -0400 Subject: [PATCH 29/55] support adamw and grad norm hyperparams --- src/axolotl/utils/trainer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 5152e649b..5cf3107f3 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -115,6 +115,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): # TODO search Path("./") for one training_arguments_kwargs["deepspeed"] = "./ds_config.json" + if cfg.adam_beta1: + training_arguments_kwargs["adam_beta1"] = cfg.adam_beta1 + if cfg.adam_beta2: + training_arguments_kwargs["adam_beta2"] = cfg.adam_beta2 + if cfg.adam_epsilon: + training_arguments_kwargs["adam_epsilon"] = cfg.adam_epsilon + if cfg.max_grad_norm: + training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm + training_args = transformers.TrainingArguments( per_device_train_batch_size=cfg.micro_batch_size, per_device_eval_batch_size=cfg.eval_batch_size From c969f0a9dc28c9f095a2bb6b3ecede0216d909b5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 08:43:20 -0400 Subject: [PATCH 30/55] add docs --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index d6c9cfefb..5fbac1a48 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,12 @@ log_sweep_max_lr: optimizer: # specify weight decay weight_decay: +# adamw hyperparams +adam_beta1: +adam_beta2: +adam_epsilon: +# Gradient clipping max norm +max_grad_norm: # whether to 
bettertransformers flash_optimum: From cb9d3af5c00e0189f95c03d64efdc283aec54679 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 09:39:42 -0400 Subject: [PATCH 31/55] add validation and tests for adamw hyperparam --- src/axolotl/utils/validation.py | 5 ++++ tests/test_validation.py | 42 +++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 298d36c4e..2e0da69b3 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -87,6 +87,11 @@ def validate_config(cfg): "You probably want to disable group_by_length as it will force a streamed dataset to download completely." ) + if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and ( + not cfg.optimizer or "adamw" not in cfg.optimizer + ): + logging.warning("adamw hyperparameters found, but no adamw optimizer set") + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 diff --git a/tests/test_validation.py b/tests/test_validation.py index dba54586e..cc6d29a23 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -263,3 +263,45 @@ class ValidationTest(unittest.TestCase): with pytest.raises(ValueError, match=regex_exp): validate_config(cfg) + + def test_adamw_hyperparams(self): + cfg = DictDefault( + { + "optimizer": None, + "adamw_epsilon": 0.0001, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "adamw hyperparameters found, but no adamw optimizer set" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "optimizer": "adafactor", + "adamw_beta1": 0.0001, + } + ) + + with self._caplog.at_level(logging.WARNING): + validate_config(cfg) + assert any( + "adamw hyperparameters found, but no adamw optimizer set" + in record.message + for record in self._caplog.records + ) + + cfg = DictDefault( + { + "optimizer": "adamw_bnb_8bit", + "adamw_beta1": 0.0001, + "adamw_beta2": 0.0001, + "adamw_epsilon": 0.0001, + } + ) + + validate_config(cfg) From ad5ca4f734721d66b9c10a58ba7141bf13694452 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 10:12:47 -0400 Subject: [PATCH 32/55] Additional test case per pr --- tests/test_validation.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_validation.py b/tests/test_validation.py index cc6d29a23..d39a4618e 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -305,3 +305,11 @@ class ValidationTest(unittest.TestCase): ) validate_config(cfg) + + cfg = DictDefault( + { + "optimizer": "adafactor", + } + ) + + validate_config(cfg) From d35278aaf1b5829747ee8dbc1952c357bc4d1c6b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 15 Jun 2023 16:01:27 -0400 Subject: [PATCH 33/55] don't fail fast --- .github/workflows/base.yml | 1 + .github/workflows/main.yml | 1 + .github/workflows/tests.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index c5a70978b..623083db2 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -12,6 +12,7 @@ jobs: # this job needs to be run on self-hosted GPU runners... 
runs-on: self-hosted
     strategy:
+      fail-fast: false
       matrix:
         include:
           - cuda: "118"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4e7705b7d..033199154 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,6 +11,7 @@ jobs:
     if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
     strategy:
+      fail-fast: false
       matrix:
         include:
           - cuda: cu118
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 0fc7ac9d9..d5184def6 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,6 +7,7 @@ jobs:
   test:
     runs-on: ubuntu-latest
     strategy:
+      fail-fast: false
       matrix:
         python_version: ["3.9", "3.10"]
     timeout-minutes: 10

From 9bdd30cdfdfad725b03620fdb933689fe1b828d5 Mon Sep 17 00:00:00 2001
From: Utensil
Date: Wed, 21 Jun 2023 08:00:58 +0000
Subject: [PATCH 34/55] Support loading data files from a local directory

ref:
https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path
---
 src/axolotl/utils/data.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index c36bfcee9..eed7d6db1 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
                 pass
 
             # prefer local dataset, even if hub exists
-            if Path(d.path).exists():
-                ds = load_dataset(
-                    "json",
-                    data_files=d.path,
-                    streaming=False,
-                    split=None,
-                )
+            local_path = Path(d.path)
+            if local_path.exists():
+                if local_path.is_dir():
+                    ds = load_dataset(
+                        d.path,
+                        data_files=d.data_files,
+                        streaming=False,
+                        split=None,
+                    )
+                elif local_path.is_file():
+                    ds = load_dataset(
+                        "json",
+                        data_files=d.path,
+                        streaming=False,
+                        split=None,
+                    )
+                else:
+                    raise ValueError(
+                        "unhandled dataset load: local path exists, but is neither a directory nor a file"
+                    )
             elif ds_from_hub:
                 if d.data_files:
                     ds = load_dataset(

From 0aeb7c7802fa59586860035e9bbff9f25aabb211 Mon Sep 17 00:00:00 2001
From: Mahesh Sinha
Date: Wed, 21 Jun 2023 15:34:48 +0200
Subject: [PATCH 35/55] Fixing Data Readme

---
 data/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/data/README.md b/data/README.md
index 34d7a5659..c452ece7c 100644
--- a/data/README.md
+++ b/data/README.md
@@ -10,10 +10,10 @@ curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarit
 ## Convert the JSON data files to JSONL.
```shell -python3 ./scripts/alpaca_json_to_jsonl.py --input data/alpaca_data_gpt4.json > data/alpaca_data_gpt4.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/vicuna_cleaned.json > data/vicuna_cleaned.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/roleplay-similarity_0.6-instruct-dataset.json > data/roleplay-similarity_0.6-instruct-dataset.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --input data/raw/gpt4-instruct-similarity-0.6-dataset.json > data/gpt4-instruct-similarity-0.6-dataset.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl +python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl ``` --- From 47d601fa2389a7f7a0dac0bd767e669c3a326cbe Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 10:19:49 -0400 Subject: [PATCH 36/55] optionally define whether to use_fast tokenizer --- README.md | 2 ++ src/axolotl/utils/models.py | 5 +++++ tests/test_tokenizers.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 tests/test_tokenizers.py diff --git a/README.md b/README.md index 5fbac1a48..047d6aa34 100644 --- a/README.md +++ b/README.md @@ -302,6 +302,8 @@ model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Trust remote code for untrusted source trust_remote_code: +# use_fast option for tokenizer loading from_pretrained, default to True +tokenizer_use_fast: # whether you are training a 4-bit GPTQ quantized model gptq: true diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 2ae9a26aa..6d94cd674 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -34,15 +34,20 @@ def load_tokenizer( tokenizer_type, cfg, ): + use_fast = True # this is the default + if cfg.tokenizer_use_fast is not None: + use_fast = cfg.tokenizer_use_fast if tokenizer_type: tokenizer = getattr(transformers, tokenizer_type).from_pretrained( tokenizer_config, trust_remote_code=cfg.trust_remote_code or False, + use_fast=use_fast, ) else: tokenizer = AutoTokenizer.from_pretrained( tokenizer_config, trust_remote_code=cfg.trust_remote_code or False, + use_fast=use_fast, ) logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py new file mode 100644 index 000000000..f2521e8e7 --- /dev/null +++ b/tests/test_tokenizers.py @@ -0,0 +1,31 @@ +""" +Test cases for the tokenizer loading +""" +import unittest + +from axolotl.utils.dict import DictDefault +from axolotl.utils.models import load_tokenizer + + +class TestTokenizers(unittest.TestCase): + """ + test class for the load_tokenizer fn + """ + + def test_default_use_fast(self): + cfg = DictDefault({}) + tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg) + assert "Fast" in tokenizer.__class__.__name__ + + def test_dont_use_fast(self): + cfg = DictDefault( + { + "tokenizer_use_fast": False, + } + ) + tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg) + assert "Fast" not in tokenizer.__class__.__name__ + + +if __name__ == "__main__": + unittest.main() From 645c13592c06f653fd6337194d20dddba8ae8bf2 
Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 10:26:02 -0400 Subject: [PATCH 37/55] better py3 support w pre-commit --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b0eb2db49..c811a6eb3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ default_language_version: - python: python3.9 + python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks From 8d20e0a3d3f44721bb3e45f4a6d51577dd7099bc Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 17 Jun 2023 19:22:58 -0400 Subject: [PATCH 38/55] initial wip to get sys prompt from dataset --- src/axolotl/prompt_strategies/alpaca_chat.py | 6 +- src/axolotl/prompt_tokenizers.py | 4 +- src/axolotl/prompters.py | 87 ++++++++++++-------- tests/test_prompters.py | 69 +++++++++++++++- 4 files changed, 126 insertions(+), 40 deletions(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 6161d7e37..32801c3c3 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -45,8 +45,10 @@ class NoSystemPrompter(AlpacaPrompter): Null Prompter with no system prompts """ - prompt_input = "{instruction} {input} " - prompt_no_input = "{instruction} " + system_prompt = "" + system_no_input_prompt = "" + turn_format = "{instruction} {input} " + turn_no_input_format = "{instruction} " def __init__(self): # pylint: disable=super-init-not-called pass diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 6408620d7..cf80539eb 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -87,7 +87,9 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy): Tokenizing strategy for instruction-based prompts. """ - def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]: + def parse_instruction_fields( + self, prompt + ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]: raise NotImplementedError def tokenize_prompt(self, prompt): diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index 29cc4446b..4db915238 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -24,6 +24,8 @@ class AlpacaPrompter: system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n" system_no_input_prompt = "Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n" + turn_format: str + turn_no_input_format: str prompt_style: Optional[PromptStyle] = None def __init__(self, prompt_style=PromptStyle.INSTRUCT.value): @@ -32,23 +34,13 @@ class AlpacaPrompter: def match_prompt_style(self): if self.prompt_style == PromptStyle.INSTRUCT.value: - self.prompt_input = ( - self.system_prompt - + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" + self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" + self.turn_no_input_format = ( + "### Instruction:\n{instruction}\n\n### Response:\n" ) - self.prompt_no_input = ( - self.system_no_input_prompt - + "### Instruction:\n{instruction}\n\n### Response:\n" - ) - self.response_split = "### Response:" if self.prompt_style == PromptStyle.CHAT.value: - self.prompt_input = ( - self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:" - ) - self.prompt_no_input = ( - self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:" - ) - self.response_split = "ASSISTANT:" + self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" def build_prompt( self, @@ -59,15 +51,39 @@ class AlpacaPrompter: # returns the full prompt from instruction and optional input # if a label (=response, =output) is provided, it's also appended. if input: - res = self.prompt_input.format(instruction=instruction, input=input) + res = self.system_prompt + self.turn_format.format( + instruction=instruction, input=input + ) else: - res = self.prompt_no_input.format(instruction=instruction) + res = self.system_no_input_prompt + self.turn_no_input_format.format( + instruction=instruction + ) if output: res = f"{res}{output}" yield res - def get_response(self, output: str) -> str: - return output.split(self.response_split)[1].strip() + +class SystemDataPrompter(AlpacaPrompter): + """ + Alpaca Style Prompter that uses system prompts from the dataset + """ + + def build_prompt_w_system( + self, + system: str, + instruction: str, + input: Union[None, str] = None, # pylint: disable=redefined-builtin + output: Union[None, str] = None, + ) -> Generator[str, None, None]: + # returns the full prompt from instruction and optional input + # if a label (=response, =output) is provided, it's also appended. + if input: + res = system + self.turn_format.format(instruction=instruction, input=input) + else: + res = system + self.turn_no_input_format.format(instruction=instruction) + if output: + res = f"{res}{output}" + yield res class UnpromptedPrompter(AlpacaPrompter): @@ -93,7 +109,10 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter): """ system_prompt = ( - "Choose the answer that best answers the question. Explain your reasoning." + "Choose the answer that best answers the question. Explain your reasoning.\n" + ) + system_no_input_prompt = ( + "Choose the answer that best answers the question. Explain your reasoning.\n" ) @@ -102,7 +121,12 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter): Prompter for multiple choice concise """ - prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n" + system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n" + system_no_input_prompt = "Choose the answer that best answers the question. 
Be concise in your response.\n\n" + + def match_prompt_style(self): + self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: {instruction}\nASSISTANT:" class SummarizeTLDRPrompter(AlpacaPrompter): @@ -110,9 +134,12 @@ class SummarizeTLDRPrompter(AlpacaPrompter): Prompter for summarize TLDR """ - prompt_no_input = ( - "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:" - ) + system_prompt = "" + system_no_input_prompt = "" + + def match_prompt_style(self): + self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:" + self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:" class CompletionPrompter: @@ -128,9 +155,6 @@ class CompletionPrompter: ) -> Generator[str, None, None]: yield instruction - def get_response(self, output: str) -> str: - return output.strip() - class GPTeacherPrompter(AlpacaPrompter): """ @@ -210,9 +234,6 @@ class ReflectAlpacaPrompter: res = f"{res}{label}" yield res - def get_response(self, output: str) -> str: - return output.split(self.response_split)[1].strip() - class SeparatorStyle(Enum): """Different separator style.""" @@ -289,12 +310,6 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods sep2=" ", ) - # def match_prompt_style(self): - # if self.prompt_style == PromptStyle.chat.value: - # self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:" - # self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:" - # self.response_split = "ASSISTANT:" - def build_prompt(self, source) -> Generator[str, None, None]: # ignore the system prompt if provided if source[0]["from"] == "system": diff --git a/tests/test_prompters.py b/tests/test_prompters.py index 11610ccc5..bb33afbb6 100644 --- a/tests/test_prompters.py +++ b/tests/test_prompters.py @@ -2,7 +2,13 @@ import unittest -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import ( + AlpacaPrompter, + MultipleChoiceExplainPrompter, + PromptStyle, + SystemDataPrompter, + UnpromptedPrompter, +) class AlpacaPrompterTest(unittest.TestCase): @@ -55,3 +61,64 @@ class AlpacaPrompterTest(unittest.TestCase): assert "### Response:" not in res assert "USER:" in res assert "ASSISTANT:" in res + + def test_system_prompt(self): + prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value) + res = next( + prompter.build_prompt_w_system( + "use cot", "tell me a joke about the following", "alpacas" + ) + ) + assert "use cot" in res + assert res.startswith("use cot") + assert "### Instruction:" not in res + assert "### Input:" not in res + assert "alpacas" in res + assert "### Response:" not in res + assert "USER:" in res + assert "ASSISTANT:" in res + + +class UnpromptedPrompterTest(unittest.TestCase): + """ + Test class for UnpromptedPrompter with no system prompts + """ + + def test_prompt_style_w_none(self): + prompter = UnpromptedPrompter(prompt_style=None) + res = next(prompter.build_prompt("tell me a joke")) + assert "### Instruction:" in res + assert "tell me a joke" in res + assert res.startswith("###") + + def test_prompt_style_w_instruct(self): + prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value) + res = next( + prompter.build_prompt("tell me a joke about the following", "alpacas") + ) + assert "### Instruction:" in res + assert "tell me a joke" in res + assert res.startswith("###") + + def test_prompt_style_w_chat(self): + prompter 
= UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(
+            prompter.build_prompt("tell me a joke about the following", "alpacas")
+        )
+        assert "USER:" in res
+        assert "tell me a joke" in res
+        assert res.startswith("USER:")
+
+
+class MultipleChoiceExplainPrompterTest(unittest.TestCase):
+    """
+    Test class for MultipleChoiceExplainPrompter
+    """
+
+    def test_prompt_style_w_chat(self):
+        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
+        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
+        assert "USER:" in res
+        assert "choose one" in res
+        assert "Choose the answer that best answers the question." in res
+        assert "- A\n- B\n- C" in res

From 3a38271276224741fc9b2766b322a9bc54bba9c3 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 17 Jun 2023 23:52:40 -0400
Subject: [PATCH 39/55] add tests and support for loader for sys prompt data

---
 .../prompt_strategies/alpaca_w_system.py      | 83 +++++++++++++++++++
 src/axolotl/prompters.py                      | 23 -----
 src/axolotl/utils/tokenization.py             |  2 +
 tests/test_prompt_tokenizers.py               | 40 ++++++++-
 tests/test_prompters.py                       |  2 +-
 5 files changed, 125 insertions(+), 25 deletions(-)
 create mode 100644 src/axolotl/prompt_strategies/alpaca_w_system.py

diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py
new file mode 100644
index 000000000..88acf0d0e
--- /dev/null
+++ b/src/axolotl/prompt_strategies/alpaca_w_system.py
@@ -0,0 +1,83 @@
+"""
+Prompt strategies loader for alpaca instruction datasets with system prompts
+"""
+from typing import Generator, Tuple, Union
+
+from axolotl.prompt_tokenizers import PromptTokenizingStrategy
+from axolotl.prompters import AlpacaPrompter, PromptStyle
+
+
+class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
+    """
+    Tokenizing strategy for instruction-based prompts.
+    """
+
+    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
+        return (
+            prompt["instruction"],
+            prompt["input"] if "input" in prompt else "",
+            prompt["output"],
+            prompt["system"],
+        )
+
+    def tokenize_prompt(self, prompt):
+        (
+            instruction,
+            input,  # pylint: disable=redefined-builtin
+            response,
+            system,
+        ) = self.parse_instruction_fields(prompt)
+        user_prompt = next(
+            iter(
+                self.prompter.build_prompt_w_system(
+                    system,
+                    instruction,
+                    input,
+                )
+            )
+        )
+        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
+        if not self.train_on_inputs:
+            user_prompt_len = len(tokenized_prompt["input_ids"])
+            # TODO this could be sped up using numpy array slicing
+            tokenized_prompt["labels"] = [-100] * user_prompt_len
+        tokenized_res_prompt = self._tokenize(
+            response, strip_bos_token=True, add_eos_token=True
+        )
+        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
+        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
+        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
+
+        return tokenized_prompt
+
+
+class SystemDataPrompter(AlpacaPrompter):
+    """
+    Alpaca Style Prompter that uses system prompts from the dataset
+    """
+
+    def build_prompt_w_system(
+        self,
+        system: str,
+        instruction: str,
+        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        output: Union[None, str] = None,
+    ) -> Generator[str, None, None]:
+        # returns the full prompt from instruction and optional input
+        # if a label (=response, =output) is provided, it's also appended.
+ if input: + res = system + self.turn_format.format(instruction=instruction, input=input) + else: + res = system + self.turn_no_input_format.format(instruction=instruction) + if output: + res = f"{res}{output}" + yield res + + +def load(tokenizer, cfg): + return InstructionWSystemPromptTokenizingStrategy( + SystemDataPrompter(PromptStyle.CHAT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index 4db915238..715a227c8 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -63,29 +63,6 @@ class AlpacaPrompter: yield res -class SystemDataPrompter(AlpacaPrompter): - """ - Alpaca Style Prompter that uses system prompts from the dataset - """ - - def build_prompt_w_system( - self, - system: str, - instruction: str, - input: Union[None, str] = None, # pylint: disable=redefined-builtin - output: Union[None, str] = None, - ) -> Generator[str, None, None]: - # returns the full prompt from instruction and optional input - # if a label (=response, =output) is provided, it's also appended. - if input: - res = system + self.turn_format.format(instruction=instruction, input=input) - else: - res = system + self.turn_no_input_format.format(instruction=instruction) - if output: - res = f"{res}{output}" - yield res - - class UnpromptedPrompter(AlpacaPrompter): """ Prompter for alpaca no system prompt diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py index 1c535eb1b..7d0d1dd83 100644 --- a/src/axolotl/utils/tokenization.py +++ b/src/axolotl/utils/tokenization.py @@ -34,3 +34,5 @@ def check_example_labels(example, tokenizer): logging.info(" ".join(colored_tokens)) logging.info("\n\n\n") + + return " ".join(colored_tokens) diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index aba340eee..3ddbe77bf 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -7,11 +7,15 @@ from pathlib import Path from transformers import AutoTokenizer from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter +from axolotl.prompt_strategies.alpaca_w_system import ( + InstructionWSystemPromptTokenizingStrategy, + SystemDataPrompter, +) from axolotl.prompt_tokenizers import ( AlpacaPromptTokenizingStrategy, ShareGPTPromptTokenizingStrategy, ) -from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter +from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter logging.basicConfig(level="INFO") @@ -96,5 +100,39 @@ class TestPromptTokenizationStrategies(unittest.TestCase): assert example["labels"][world_idx - 1] == -100 +class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase): + """ + Test class for prompt tokenization strategies with sys prompt from the dataset + """ + + def setUp(self) -> None: + # pylint: disable=duplicate-code + self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") + self.tokenizer.add_special_tokens( + { + "bos_token": "", + "eos_token": "", + "unk_token": "", + } + ) + + def test_system_alpaca(self): + prompter = SystemDataPrompter(PromptStyle.CHAT.value) + strat = InstructionWSystemPromptTokenizingStrategy( + prompter, + self.tokenizer, + False, + 2048, + ) + sample = { + "system": "use cot", + "instruction": "hello!", + "output": "Hi! 
How can I help?", + } + example = strat.tokenize_prompt(sample) + assert example["input_ids"][0:3] == [1, 671, 20118] # use cot + assert example["input_ids"][3] == 11889 # USER + + if __name__ == "__main__": unittest.main() diff --git a/tests/test_prompters.py b/tests/test_prompters.py index bb33afbb6..756b6f81b 100644 --- a/tests/test_prompters.py +++ b/tests/test_prompters.py @@ -2,11 +2,11 @@ import unittest +from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter from axolotl.prompters import ( AlpacaPrompter, MultipleChoiceExplainPrompter, PromptStyle, - SystemDataPrompter, UnpromptedPrompter, ) From 7b57ed761882b4492659eeafffbf8ffddd3f0fbb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 18 Jun 2023 06:40:28 -0400 Subject: [PATCH 40/55] pylint for duplicated code for system prompts --- src/axolotl/datasets.py | 1 + src/axolotl/prompt_strategies/alpaca_w_system.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index 40c58bc9c..5593a8dd3 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -126,6 +126,7 @@ class ConstantLengthDataset(IterableDataset): buffer_len = 0 if example: + # FIXME # just going to drop data points that are too long if len(example["input_ids"]) <= self.seq_length: input_ids = example["input_ids"] diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index 88acf0d0e..aacae8739 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -21,6 +21,7 @@ class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy): ) def tokenize_prompt(self, prompt): + # pylint: disable=duplicate-code ( instruction, input, # pylint: disable=redefined-builtin From 05ab9092e304f234801d6496cecb60d49d86c0a4 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 25 Jun 2023 22:40:50 -0400 Subject: [PATCH 41/55] skip the system prompt --- src/axolotl/prompt_strategies/alpaca_instruct.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_strategies/alpaca_instruct.py b/src/axolotl/prompt_strategies/alpaca_instruct.py index 2e42191f8..143f070f2 100644 --- a/src/axolotl/prompt_strategies/alpaca_instruct.py +++ b/src/axolotl/prompt_strategies/alpaca_instruct.py @@ -1,7 +1,7 @@ """Module loading the AlpacaInstructPromptTokenizingStrategy class""" from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter def load(tokenizer, cfg): @@ -11,3 +11,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_no_prompt(tokenizer, cfg): + return AlpacaPromptTokenizingStrategy( + UnpromptedPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) From 612aabd8c468b6f1aeda80fdec5ec4a4bc3ae159 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 27 Jun 2023 15:40:25 -0400 Subject: [PATCH 42/55] push intermediate model checkpoints to hub --- src/axolotl/prompt_strategies/alpaca_chat.py | 11 ++++++++++- src/axolotl/utils/trainer.py | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 6161d7e37..952a55961 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -6,7 
+6,7 @@ from axolotl.prompt_tokenizers import ( AlpacaPromptTokenizingStrategy, InstructionPromptTokenizingStrategy, ) -from axolotl.prompters import AlpacaPrompter, PromptStyle +from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter def load(tokenizer, cfg): @@ -103,3 +103,12 @@ def load_camel_ai(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_no_prompt(tokenizer, cfg): + return AlpacaPromptTokenizingStrategy( + UnpromptedPrompter(PromptStyle.CHAT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 5cf3107f3..e9ec641a6 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -124,6 +124,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): if cfg.max_grad_norm: training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm + if cfg.push_to_hub_model_id: + training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id + training_arguments_kwargs["push_to_hub"] = True + training_args = transformers.TrainingArguments( per_device_train_batch_size=cfg.micro_batch_size, per_device_eval_batch_size=cfg.eval_batch_size From 924bbfddecfcd8b9ddfb5d0bad3b89d4a00edaac Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 28 Jun 2023 22:27:17 -0400 Subject: [PATCH 43/55] add option for instruct w sys prompts --- src/axolotl/prompt_strategies/alpaca_w_system.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..bcdcd9334 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -76,6 +76,19 @@ class SystemDataPrompter(AlpacaPrompter): def load(tokenizer, cfg): + return load_chat(tokenizer, cfg) + + +def load_instruct(tokenizer, cfg): + return InstructionWSystemPromptTokenizingStrategy( + SystemDataPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) + + +def load_chat(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), tokenizer, From 530809fd7405f2abb1b88ab8d6d3cb78e5e765bb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 28 Jun 2023 22:36:28 -0400 Subject: [PATCH 44/55] update pip install command for apex --- docker/Dockerfile-base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 2728f3a72..20bd80f70 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -77,7 +77,7 @@ FROM base-builder RUN python3 -m pip uninstall -y apex RUN git clone https://github.com/NVIDIA/apex # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners -RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check . 
+RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ RUN mkdir -p /workspace/builds COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes From 77bdb7d1444cd0fbd822a1a68fc2db6abbb78814 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 29 Jun 2023 14:29:55 +0900 Subject: [PATCH 45/55] Fix typing list --- src/axolotl/prompt_tokenizers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index cf80539eb..8216d73dd 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -440,7 +440,7 @@ def parse_tokenized_to_result( result: Dict[str, List[int]], current_len: int, res: Dict[str, List[int]], - labels: list[int], + labels: List[int], pad_token_id: Union[int, None] = None, ) -> Tuple[Dict[str, List[int]], int]: """ From c146880a7559d8f6b6553561cd11ad7d1745b6ae Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 30 Jun 2023 11:33:53 +0900 Subject: [PATCH 46/55] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 047d6aa34..27aec72db 100644 --- a/README.md +++ b/README.md @@ -336,6 +336,8 @@ datasets: dataset_prepared_path: data/last_run_prepared # push prepared dataset to hub push_dataset_to_hub: # repo path +# push checkpoints to hub +push_to_hub_model_id: # repo path # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets # required to be true when used in combination with `push_dataset_to_hub` hf_use_auth_token: # boolean From 78a1e1fa12b7b4698328a21e15abbc0958e8babf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 1 Jul 2023 00:19:41 -0400 Subject: [PATCH 47/55] open orca support --- README.md | 4 ++++ .../prompt_strategies/alpaca_w_system.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/README.md b/README.md index 27aec72db..4929987cb 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"message_1": "...", "message_2": "..."} ``` +- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct + ```json + {"system_prompt": "...", "question": "...", "response": "..."} + ``` - `context_qa`: in context question answering from an article ```json {"article": "...", "question": "...", "answer": "..."} diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..1b4f50219 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter): yield res +class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): + """ + Tokenizing strategy for OpenOrca datasets + """ + + def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: + return ( + prompt["question"], + "", + prompt["response"], + prompt["system_prompt"], + ) + + def load(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), @@ -82,3 +96,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_open_orca(tokenizer, cfg): + return OpenOrcaPromptTokenizingStrategy( + 
SystemDataPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + ) From a10da1caff183cf986975a06f5c7ffc4f300fb22 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 1 Jul 2023 00:29:07 -0400 Subject: [PATCH 48/55] 11.7.0 nvidia/cuda docker images are deprecated, move to 11.7.1 --- .github/workflows/base.yml | 2 +- .github/workflows/main.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 623083db2..f3ad69570 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -26,7 +26,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: - cuda: "117" - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 033199154..07f25cac6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,7 +30,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: gptq - cuda: cu117 - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: @@ -85,7 +85,7 @@ jobs: pytorch: 2.0.0 axolotl_extras: gptq - cuda: cu117 - cuda_version: 11.7.0 + cuda_version: 11.7.1 python_version: "3.9" pytorch: 1.13.1 axolotl_extras: From 71456955f5da8015dacb138ec70b9693d33a037b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 2 Jul 2023 22:26:51 -0400 Subject: [PATCH 49/55] pin pydantic so deepspeed isn't broken --- docker/Dockerfile-base | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 20bd80f70..adf7996ee 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install RUN git lfs install --skip-repo RUN pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic + pip3 install -U --no-cache-dir pydantic==1.10.10 From e79c8e617e1584a0fe4cac33c263237178b561ce Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 3 Jul 2023 12:44:29 +0900 Subject: [PATCH 50/55] Fix future deprecation push_to_hub_model_id --- README.md | 2 +- src/axolotl/utils/trainer.py | 4 ++-- src/axolotl/utils/validation.py | 5 +++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 4929987cb..e45ac54b7 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared # push prepared dataset to hub push_dataset_to_hub: # repo path # push checkpoints to hub -push_to_hub_model_id: # repo path +hub_model_id: # repo path # whether to use hf `use_auth_token` for loading datasets. 
Useful for fetching private datasets # required to be true when used in combination with `push_dataset_to_hub` hf_use_auth_token: # boolean diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index e9ec641a6..263d6c78d 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -124,8 +124,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): if cfg.max_grad_norm: training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm - if cfg.push_to_hub_model_id: - training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id + if cfg.hub_model_id: + training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id training_arguments_kwargs["push_to_hub"] = True training_args = transformers.TrainingArguments( diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 2e0da69b3..43b4b1d16 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -92,6 +92,11 @@ def validate_config(cfg): ): logging.warning("adamw hyperparameters found, but no adamw optimizer set") + if cfg.push_to_hub_model_id: + raise ValueError( + "push_to_hub_model_id is deprecated. Please use hub_model_id instead." + ) + # TODO # MPT 7b # https://github.com/facebookresearch/bitsandbytes/issues/25 From 9e64f42e0fe2f3a5075cf516c8ea0d95837e1ff5 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 6 Jul 2023 23:08:09 +0900 Subject: [PATCH 51/55] Fix local path loading and custom strategy type --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e45ac54b7..88e8b28ca 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended): #### How to add custom prompts 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. - 2. Use your custom file name as the dataset type. + 2. Use your custom file name as the dataset type `.load_`. Optionally, download some datasets, see [data/README.md](data/README.md) @@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic - dataset ```yaml + sequence_len: 2048 # max token length for prompt + + # huggingface repo datasets: - - path: vicgalle/alpaca-gpt4 # local or huggingface repo + - path: vicgalle/alpaca-gpt4 + type: alpaca # format from earlier + + # local + datasets: + - path: json + data_files: data.jsonl # or json type: alpaca # format from earlier - sequence_len: 2048 # max token length / prompt ``` - loading @@ -328,10 +336,10 @@ tf32: true # require >=ampere # a list of one or more datasets to finetune the model with datasets: - # this can be either a hf dataset, or relative path + # hf dataset repo | "json" for local dataset, make sure to fill data_files - path: vicgalle/alpaca-gpt4 # The type of prompt to use for training. 
[alpaca, sharegpt, gpteacher, oasst, reflection] - type: alpaca # format OR format:prompt_style (chat/instruct) + type: alpaca # format | format: (chat/instruct) | .load_ data_files: # path to source data files shards: # number of shards to split data into From 41da98b9823ee13234321be089d3d761c53b7529 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 6 Jul 2023 23:20:11 +0900 Subject: [PATCH 52/55] Fix for linter --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88e8b28ca..6b81e69de 100644 --- a/README.md +++ b/README.md @@ -256,8 +256,8 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic - dataset ```yaml sequence_len: 2048 # max token length for prompt - - # huggingface repo + + # huggingface repo datasets: - path: vicgalle/alpaca-gpt4 type: alpaca # format from earlier From 66afb76a15cb0f930baab850e77cc16d0cdfd029 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 7 Jul 2023 21:31:02 -0400 Subject: [PATCH 53/55] don't use llama if trust_remote_code is set since that needs to use AutoModel path --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 6d94cd674..95311ca2b 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -202,7 +202,7 @@ def load_model( else True, ) load_in_8bit = False - elif cfg.is_llama_derived_model: + elif cfg.is_llama_derived_model and not cfg.trust_remote_code: from transformers import LlamaForCausalLM config = LlamaConfig.from_pretrained(base_model_config) From d69da99c2c43c035c5ee7a425ad9c85aeef81dfb Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 7 Jul 2023 21:33:11 -0400 Subject: [PATCH 54/55] skip explicit model type too if using trust_remote_code --- src/axolotl/utils/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index 95311ca2b..7181cca31 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -241,7 +241,7 @@ def load_model( # device=cfg.device, # ) # model.train() # sets to train instead of eval mode - elif model_type: + elif model_type and not cfg.trust_remote_code: model = getattr(transformers, model_type).from_pretrained( base_model, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, From 19cf0bda99b0957dd4ccd2152d27faa84f6f58a8 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 8 Jul 2023 12:13:39 -0400 Subject: [PATCH 55/55] params are adam_*, not adamw_* --- src/axolotl/utils/validation.py | 2 +- tests/test_validation.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/axolotl/utils/validation.py b/src/axolotl/utils/validation.py index 43b4b1d16..40dfb84a9 100644 --- a/src/axolotl/utils/validation.py +++ b/src/axolotl/utils/validation.py @@ -87,7 +87,7 @@ def validate_config(cfg): "You probably want to disable group_by_length as it will force a streamed dataset to download completely." 
) - if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and ( + if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and ( not cfg.optimizer or "adamw" not in cfg.optimizer ): logging.warning("adamw hyperparameters found, but no adamw optimizer set") diff --git a/tests/test_validation.py b/tests/test_validation.py index d39a4618e..88c97f0b7 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": None, - "adamw_epsilon": 0.0001, + "adam_epsilon": 0.0001, } ) @@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": "adafactor", - "adamw_beta1": 0.0001, + "adam_beta1": 0.0001, } ) @@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase): cfg = DictDefault( { "optimizer": "adamw_bnb_8bit", - "adamw_beta1": 0.0001, - "adamw_beta2": 0.0001, - "adamw_epsilon": 0.0001, + "adam_beta1": 0.9, + "adam_beta2": 0.99, + "adam_epsilon": 0.0001, } )