diff --git a/README.md b/README.md
index ff4cb1076..234a54961 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,97 @@ datasets:
 - Optionally Download some datasets, see [data/README.md](data/README.md)
-- Create a new or update the existing YAML config [config/pythia_1_2B_alpaca.yml](config/pythia_1_2B_alpaca.yml)
+- Create a new YAML config or update an existing one, e.g. [configs/sample.yml](configs/sample.yml)
+
+```yaml
+# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# this can also be a relative path to a model on disk
+base_model: decapoda-research/llama-7b-hf-int4
+# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+base_model_ignore_patterns:
+# if the base_model repo on hf hub doesn't include configuration .json files,
+# you can set that here, or leave this empty to default to base_model
+base_model_config: decapoda-research/llama-7b-hf
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice
+model_type: AutoModelForCausalLM
+# Corresponding tokenizer for the model; AutoTokenizer is a good choice
+tokenizer_type: AutoTokenizer
+# whether you are training a 4-bit quantized model
+load_4bit: true
+# this will attempt to quantize the model down to 8 bits and use the Adam 8-bit optimizer
+load_in_8bit: true
+# a list of one or more datasets to finetune the model with
+datasets:
+  # this can be either a hf dataset or a relative path
+  - path: vicgalle/alpaca-gpt4
+    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+    type: alpaca
+# axolotl attempts to save the dataset as an arrow file after packing the data together so
+# subsequent training attempts load faster; relative path
+dataset_prepared_path: data/last_run_prepared
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
+val_set_size: 0.04
+# set this to lora if you want to use lora, or leave blank to train all parameters of the original model
+adapter: lora
+# if you already have a lora model trained that you want to load, put that here
+lora_model_dir:
+# the maximum length of an input to train with; this should typically be 2048 or less,
+# as most models have a token/context limit of 2048
+sequence_len: 2048
+# max sequence length to concatenate training samples together up to
+# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+max_packed_sequence_len: 1024
+# lora hyperparameters
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+# - k_proj
+# - o_proj
+lora_fan_in_fan_out: false
+# wandb configuration if you're using it
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model: checkpoint
+# where to save the finished model to
+output_dir: ./completed-model
+# training hyperparameters
+batch_size: 8
+micro_batch_size: 2
+num_epochs: 3
+warmup_steps: 100
+learning_rate: 0.00003
+# whether to mask out the human's prompt in the training labels or include it
+train_on_inputs: false
+# don't use this, it leads to wonky training (according to someone on the internet)
+group_by_length: false
+# Use CUDA bf16
+bf16: true
+# Use CUDA tf32
+tf32: true
+# does not work with the current implementation of 4-bit LoRA
+gradient_checkpointing: false
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+# specify a scheduler to use with the optimizer. only one_cycle is supported currently
+lr_scheduler:
+# whether to use the xformers attention patch https://github.com/facebookresearch/xformers
+xformers_attention:
+# whether to use the flash attention patch https://github.com/HazyResearch/flash-attention
+flash_attention:
+# resume from a specific checkpoint dir
+resume_from_checkpoint:
+# if resume_from_checkpoint isn't set and you simply want training to start where it left off
+# be careful with this being turned on between different models
+auto_resume_from_checkpoints: false
+# don't mess with this, it's here for accelerate and torchrun
+local_rank:
+```
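+
+For a sense of how the batch settings interact: `batch_size` is the effective batch size, and the gap between it and `micro_batch_size` is closed with gradient accumulation. A rough sketch of the arithmetic (illustrative only, not the trainer's exact internals):
+
+```python
+# an effective batch of 8 built from micro-batches of 2 means 4 accumulation steps
+batch_size = 8
+micro_batch_size = 2
+gradient_accumulation_steps = batch_size // micro_batch_size
+assert gradient_accumulation_steps == 4
+```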
+
+
 - Install python dependencies with ONE of the following:
   - `pip3 install -e .[int4]` (recommended)
@@ -54,3 +144,29 @@ use_cpu: false
 - Train! `accelerate launch scripts/finetune.py`, make sure to choose the correct YAML config file
 - Alternatively you can pass in the config file like: `accelerate launch scripts/finetune.py configs/llama_7B_alpaca.yml`~~
+
+
+## How to start training on Runpod in under 10 minutes
+
+- Choose your Docker container wisely.
+- I recommend `huggingface/transformers-pytorch-deepspeed-latest-gpu`; see https://hub.docker.com/r/huggingface/transformers-pytorch-deepspeed-latest-gpu/
+- Once your runpod starts, SSH into it and run:
+```shell
+source <(curl -s https://raw.githubusercontent.com/winglian/axolotl/main/scripts/setup-runpod.sh)
+```
+
+- Once the setup script completes, start training with:
+```shell
+accelerate launch scripts/finetune.py configs/quickstart.yml
+```
+
+- Here are some helpful environment variables you'll want to set manually if you open a new shell:
+```shell
+export WANDB_MODE=offline
+export WANDB_CACHE_DIR=/workspace/data/wandb-cache
+export HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+export HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+export TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+export NCCL_P2P_DISABLE=1
+```
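+
+- The sample configs enable `bf16` and `tf32`, which need an Ampere-class GPU or newer. A quick way to check the pod you rented (assumes PyTorch is already installed):
+```python
+import torch
+
+assert torch.cuda.is_available(), "no CUDA device visible"
+# A100/A6000/3090-class cards report True; V100 and T4 report False
+print("device:", torch.cuda.get_device_name(0))
+print("bf16 supported:", torch.cuda.is_bf16_supported())
+```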
diff --git a/configs/accelerate/default_config.yaml b/configs/accelerate/default_config.yaml
new file mode 100644
index 000000000..9759703af
--- /dev/null
+++ b/configs/accelerate/default_config.yaml
@@ -0,0 +1,15 @@
+compute_environment: LOCAL_MACHINE
+distributed_type: 'NO'
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: bf16
+num_machines: 1
+num_processes: 1
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
diff --git a/configs/llama_7B_4bit.yml b/configs/llama_7B_4bit.yml
index 422ad5724..0033a6e3a 100644
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -4,7 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 datasets:
-  - path: vicgalle/alpaca-gpt4
+  - path: tatsu-lab/alpaca # original alpaca dataset
     type: alpaca
 dataset_prepared_path: data/last_run_prepared
 val_set_size: 0.04
@@ -29,6 +29,7 @@ output_dir: ./lora-test
 batch_size: 8
 micro_batch_size: 2
 num_epochs: 3
+warmup_steps: 100
 learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
@@ -37,5 +38,8 @@ tf32: true
 gradient_checkpointing: false
 early_stopping_patience: 3
 resume_from_checkpoint:
+auto_resume_from_checkpoints: true
 local_rank:
 load_4bit: true
+xformers_attention: true
+flash_attention:
diff --git a/configs/quickstart.yml b/configs/quickstart.yml
new file mode 100644
index 000000000..a2cbdff4d
--- /dev/null
+++ b/configs/quickstart.yml
@@ -0,0 +1,45 @@
+base_model: decapoda-research/llama-7b-hf-int4
+base_model_config: decapoda-research/llama-7b-hf
+model_type: LlamaForCausalLM
+tokenizer_type: LlamaTokenizer
+load_in_8bit: true
+datasets:
+  - path: tatsu-lab/alpaca # original alpaca dataset
+    type: alpaca
+dataset_prepared_path: data/last_run_prepared
+val_set_size: 0.04
+adapter: lora
+lora_model_dir:
+sequence_len: 1024
+max_packed_sequence_len: 1024
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+# - k_proj
+# - o_proj
+lora_fan_in_fan_out: false
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model: checkpoint
+output_dir: ./lora-test
+batch_size: 4
+micro_batch_size: 1
+num_epochs: 3
+warmup_steps: 100
+learning_rate: 0.00003
+train_on_inputs: false
+group_by_length: false
+bf16: true
+tf32: true
+gradient_checkpointing: false
+early_stopping_patience: 3
+resume_from_checkpoint:
+auto_resume_from_checkpoints: true
+local_rank:
+load_4bit: true
+xformers_attention: true
+flash_attention:
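Note on `val_set_size: 0.04` in the configs above: it holds out 4% of the data for evaluation, which for the ~52k-row alpaca dataset is roughly 2,000 examples. The equivalent split using the `datasets` API directly (illustrative; this is not the repo's loader, and the seed is arbitrary):

```python
from datasets import load_dataset

ds = load_dataset("tatsu-lab/alpaca", split="train")
splits = ds.train_test_split(test_size=0.04, seed=42)
print(len(splits["train"]), len(splits["test"]))  # roughly 49.9k train, 2.1k eval
```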
diff --git a/configs/sample.yml b/configs/sample.yml
new file mode 100644
index 000000000..7d076c573
--- /dev/null
+++ b/configs/sample.yml
@@ -0,0 +1,86 @@
+# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# this can also be a relative path to a model on disk
+base_model: decapoda-research/llama-7b-hf-int4
+# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+base_model_ignore_patterns:
+# if the base_model repo on hf hub doesn't include configuration .json files,
+# you can set that here, or leave this empty to default to base_model
+base_model_config: decapoda-research/llama-7b-hf
+# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice
+model_type: AutoModelForCausalLM
+# Corresponding tokenizer for the model; AutoTokenizer is a good choice
+tokenizer_type: AutoTokenizer
+# whether you are training a 4-bit quantized model
+load_4bit: true
+# this will attempt to quantize the model down to 8 bits and use the Adam 8-bit optimizer
+load_in_8bit: true
+# a list of one or more datasets to finetune the model with
+datasets:
+  # this can be either a hf dataset or a relative path
+  - path: vicgalle/alpaca-gpt4
+    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+    type: alpaca
+# axolotl attempts to save the dataset as an arrow file after packing the data together so
+# subsequent training attempts load faster; relative path
+dataset_prepared_path: data/last_run_prepared
+# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
+val_set_size: 0.04
+# set this to lora if you want to use lora, or leave blank to train all parameters of the original model
+adapter: lora
+# if you already have a lora model trained that you want to load, put that here
+lora_model_dir:
+# the maximum length of an input to train with; this should typically be 2048 or less,
+# as most models have a token/context limit of 2048
+sequence_len: 2048
+# max sequence length to concatenate training samples together up to
+# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+max_packed_sequence_len: 1024
+# lora hyperparameters
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules:
+  - q_proj
+  - v_proj
+# - k_proj
+# - o_proj
+lora_fan_in_fan_out: false
+# wandb configuration if you're using it
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model: checkpoint
+# where to save the finished model to
+output_dir: ./completed-model
+# training hyperparameters
+batch_size: 8
+micro_batch_size: 2
+num_epochs: 3
+warmup_steps: 100
+learning_rate: 0.00003
+# whether to mask out the human's prompt in the training labels or include it
+train_on_inputs: false
+# don't use this, it leads to wonky training (according to someone on the internet)
+group_by_length: false
+# Use CUDA bf16
+bf16: true
+# Use CUDA tf32
+tf32: true
+# does not work with the current implementation of 4-bit LoRA
+gradient_checkpointing: false
+# stop training after this many evaluation losses have increased in a row
+# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+early_stopping_patience: 3
+# specify a scheduler to use with the optimizer. only one_cycle is supported currently
+lr_scheduler:
+# whether to use the xformers attention patch https://github.com/facebookresearch/xformers
+xformers_attention:
+# whether to use the flash attention patch https://github.com/HazyResearch/flash-attention
+flash_attention:
+# resume from a specific checkpoint dir
+resume_from_checkpoint:
+# if resume_from_checkpoint isn't set and you simply want training to start where it left off
+# be careful with this being turned on between different models
+auto_resume_from_checkpoints: false
+# don't mess with this, it's here for accelerate and torchrun
+local_rank:
diff --git a/requirements.txt b/requirements.txt
index eecb41b42..433b97aed 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,5 @@ wandb
 flash-attn
 deepspeed
 einops
+xformers
+
diff --git a/scripts/finetune.py b/scripts/finetune.py
index e2c813416..ba3a59a6a 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -225,7 +225,14 @@ def train(
     )
 
     logging.info("Starting trainer...")
-    trainer.train(resume_from_checkpoint=cfg.resume_from_checkpoint)
+    resume_from_checkpoint = cfg.resume_from_checkpoint
+    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
+        possible_checkpoints = [str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")]
+        if len(possible_checkpoints) > 0:
+            sorted_paths = sorted(possible_checkpoints, key=lambda path: int(path.split('-')[-1]))
+            resume_from_checkpoint = sorted_paths[-1]
+            logging.info(f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}")
+    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
 
     if cfg.local_rank == 0:
         # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
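One detail worth calling out in the auto-resume hunk above: checkpoint directories must be sorted numerically, because a plain lexicographic sort misorders them as soon as the step count gains a digit:

```python
checkpoints = ["checkpoint-999", "checkpoint-1000"]
print(sorted(checkpoints))  # ['checkpoint-1000', 'checkpoint-999'], wrong order
print(sorted(checkpoints, key=lambda p: int(p.split("-")[-1])))  # correct order
```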
diff --git a/scripts/setup-runpod.sh b/scripts/setup-runpod.sh
new file mode 100644
index 000000000..10f71ebfd
--- /dev/null
+++ b/scripts/setup-runpod.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+export WANDB_MODE=offline
+export WANDB_CACHE_DIR=/workspace/data/wandb-cache
+mkdir -p $WANDB_CACHE_DIR
+
+mkdir -p /workspace/data/huggingface-cache/{hub,datasets}
+export HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+export HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+export TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+export NCCL_P2P_DISABLE=1
+
+nvidia-smi
+num_gpus=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+gpu_indices=$(seq 0 $((num_gpus - 1)) | paste -sd "," -)
+export CUDA_VISIBLE_DEVICES=$gpu_indices
+echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+
+apt-get update
+apt-get install -y build-essential ninja-build vim git-lfs
+git lfs install
+pip3 install --force-reinstall https://download.pytorch.org/whl/nightly/cu117/torch-2.0.0.dev20230301%2Bcu117-cp38-cp38-linux_x86_64.whl --index-url https://download.pytorch.org/whl/nightly/cu117
+if [ -z "${TORCH_CUDA_ARCH_LIST}" ]; then # only set this if not set yet
+    # this covers most common GPUs that the installed version of pytorch supports
+    # python -c "import torch; print(torch.cuda.get_arch_list())"
+    export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+fi
+
+cd /workspace/
+git clone https://github.com/winglian/axolotl.git
+cd axolotl
+pip install -e .[int4]
+mkdir -p ~/.cache/huggingface/accelerate/
+cp configs/accelerate/default_config.yaml ~/.cache/huggingface/accelerate/default_config.yaml
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 649578bd5..d05e62d29 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -66,7 +66,10 @@ def load_model(
     from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
     from huggingface_hub import snapshot_download
 
-    cache_model_path = Path(snapshot_download(base_model))
+    snapshot_download_kwargs = {}
+    if cfg.base_model_ignore_patterns:
+        snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns
+    cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs))
     files = (
         list(cache_model_path.glob("*.pt"))
         + list(cache_model_path.glob("*.safetensors"))
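For context, `base_model_ignore_patterns` is passed straight through to the `ignore_patterns` argument of `huggingface_hub.snapshot_download`. Used standalone it looks like this (the repo id and pattern are only examples):

```python
from huggingface_hub import snapshot_download

# skip the pickled *.pt shards and download only the remaining files
path = snapshot_download("decapoda-research/llama-7b-hf", ignore_patterns=["*.pt"])
print(path)
```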
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index f1c357803..9f4262962 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -11,9 +11,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
     total_num_steps = int(
         math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
     )
-    warmup_steps = min(int(0.03 * total_num_steps), 100)
+    warmup_steps = cfg.warmup_steps if cfg.warmup_steps else min(int(0.03 * total_num_steps), 100)
     logging_steps = max(min(int(0.005 * total_num_steps), 10), 1)
-    save_steps = eval_steps = min(int(0.05 * total_num_steps), 200)
+    save_steps = eval_steps = cfg.save_steps if cfg.save_steps else min(int(0.05 * total_num_steps), 200)
 
     training_arguments_kwargs = {}
     if cfg.bf16 == "full":
@@ -45,24 +45,23 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
         **training_arguments_kwargs,
     )
 
-    decay_parameters = get_parameter_names(model, [nn.LayerNorm])
-    decay_parameters = [name for name in decay_parameters if "bias" not in name]
-    optimizer_grouped_parameters = [
-        {
-            "params": [p for n, p in model.named_parameters() if n in decay_parameters],
-            "weight_decay": training_args.weight_decay,
-        },
-        {
-            "params": [
-                p for n, p in model.named_parameters() if n not in decay_parameters
-            ],
-            "weight_decay": 0.0,
-        },
-    ]
-
     trainer_kwargs = {}
 
     if cfg.load_in_8bit and not cfg.load_4bit:
+        decay_parameters = get_parameter_names(model, [nn.LayerNorm])
+        decay_parameters = [name for name in decay_parameters if "bias" not in name]
+        optimizer_grouped_parameters = [
+            {
+                "params": [p for n, p in model.named_parameters() if n in decay_parameters],
+                "weight_decay": training_args.weight_decay,
+            },
+            {
+                "params": [
+                    p for n, p in model.named_parameters() if n not in decay_parameters
+                ],
+                "weight_decay": 0.0,
+            },
+        ]
         optimizer = bnb.optim.Adam8bit(
             optimizer_grouped_parameters,
             betas=(training_args.adam_beta1, training_args.adam_beta2),
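To make the step heuristics above concrete: with the ~52k-row alpaca dataset, `batch_size: 8`, and `num_epochs: 3`, the fallback values work out as follows (plain arithmetic mirroring the formulas in this hunk):

```python
import math

total_num_steps = math.ceil(52002 * 3 / 8)  # 19501
warmup_steps = min(int(0.03 * total_num_steps), 100)  # 100, unless cfg.warmup_steps overrides
logging_steps = max(min(int(0.005 * total_num_steps), 10), 1)  # 10
save_steps = eval_steps = min(int(0.05 * total_num_steps), 200)  # 200, unless cfg.save_steps overrides
print(total_num_steps, warmup_steps, logging_steps, save_steps)  # 19501 100 10 200
```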