From 17605b85d8046b7dee53289175dea17b8700fe0b Mon Sep 17 00:00:00 2001 From: Maxime <672982+maximegmd@users.noreply.github.com> Date: Sat, 26 Aug 2023 22:40:56 +0200 Subject: [PATCH 1/6] fix: inference did not move the model to the correct device (#483) --- scripts/finetune.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/finetune.py b/scripts/finetune.py index 3255a623f..d02448ec2 100644 --- a/scripts/finetune.py +++ b/scripts/finetune.py @@ -82,6 +82,8 @@ def do_inference(cfg, model, tokenizer, prompter: Optional[str]): max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None ) + model = model.to(cfg.device) + while True: print("=" * 80) # support for multiline inputs From 868530c39c2d2a5ddcce1483bd73951fb376e18b Mon Sep 17 00:00:00 2001 From: Aman Karmani Date: Sat, 26 Aug 2023 21:40:12 +0000 Subject: [PATCH 2/6] let transformers handle adamw_bnb_8bit --- src/axolotl/utils/trainer.py | 73 +----------------------------------- 1 file changed, 2 insertions(+), 71 deletions(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c73b4a713..24be1b8c2 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -10,19 +10,13 @@ from functools import partial from pathlib import Path from typing import Optional, Union -import bitsandbytes as bnb import numpy as np import torch.cuda -import transformers from datasets import Dataset, set_caching_enabled -from torch import nn from torch.optim.lr_scheduler import OneCycleLR from torch.utils.data import DataLoader, DistributedSampler, RandomSampler from transformers import EarlyStoppingCallback, Trainer, TrainingArguments -from transformers.trainer_pt_utils import ( - SequentialDistributedSampler, - get_parameter_names, -) +from transformers.trainer_pt_utils import SequentialDistributedSampler from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler from axolotl.utils.callbacks import ( @@ -32,10 +26,7 @@ from axolotl.utils.callbacks import ( ) from axolotl.utils.collators import DataCollatorForSeq2Seq from axolotl.utils.dataloader import MultipackDistributedDataloader -from axolotl.utils.schedulers import ( - InterpolatingLogScheduler, - get_cosine_schedule_with_quadratic_warmup, -) +from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup LOG = logging.getLogger("axolotl") @@ -570,66 +561,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_ if Path(cfg.torchdistx_path).exists(): sys.path.append(cfg.torchdistx_path) importlib.import_module("torchdistx") - if ( - cfg.optimizer == "adamw_bnb_8bit" - and not cfg.gptq - and "deepspeed" not in training_arguments_kwargs - and not cfg.fsdp - ): - decay_parameters = get_parameter_names(model, [nn.LayerNorm]) - decay_parameters = [name for name in decay_parameters if "bias" not in name] - optimizer_grouped_parameters = [ - { - "params": [ - p - for n, p in model.named_parameters() - if (n in decay_parameters and p.requires_grad) - ], - "weight_decay": training_args.weight_decay, - }, - { - "params": [ - p - for n, p in model.named_parameters() - if (n not in decay_parameters and p.requires_grad) - ], - "weight_decay": 0.0, - }, - ] - - optimizer = bnb.optim.Adam8bit( - optimizer_grouped_parameters, - betas=(training_args.adam_beta1, training_args.adam_beta2), - eps=training_args.adam_epsilon, - lr=training_args.learning_rate, - ) - - if cfg.lr_scheduler == "one_cycle": - lr_scheduler_kwargs = ( - cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {} - ) - lr_scheduler = 
OneCycleLR( - optimizer, - cfg.learning_rate, - total_steps=total_num_steps, - epochs=cfg.num_epochs, - div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6, - **lr_scheduler_kwargs, - ) - elif cfg.lr_scheduler == "log_sweep": - lr_scheduler = InterpolatingLogScheduler( - optimizer, - cfg.warmup_steps, - cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10, - cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10, - ) - else: - lr_scheduler = transformers.get_cosine_schedule_with_warmup( - optimizer, - training_args.warmup_steps, - total_num_steps, - ) - trainer_kwargs["optimizers"] = (optimizer, lr_scheduler) callbacks = [] callbacks.append(GPUStatsCallback(cfg)) From fe4d6baf9286e0eea18a3e752099a4fa16aef606 Mon Sep 17 00:00:00 2001 From: "Charles O. Goddard" Date: Sat, 26 Aug 2023 18:08:34 -0700 Subject: [PATCH 3/6] Add example Llama 2 ReLoRA config (#471) * Add example Llama 2 ReLoRA config * Use adamw_bnb_8bit in example relora config --- examples/llama-2/relora.yml | 73 +++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 examples/llama-2/relora.yml diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml new file mode 100644 index 000000000..66515dabc --- /dev/null +++ b/examples/llama-2/relora.yml @@ -0,0 +1,73 @@ +base_model: meta-llama/Llama-2-7b-hf +base_model_config: meta-llama/Llama-2-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: LlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: teknium/GPT4-LLM-Cleaned + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./relora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 4096 +sample_packing: true + +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +relora_steps: 150 +relora_warmup_steps: 10 +relora_cpu_offload: false + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 4 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: 50 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" From ad8be435ad42dc7f1feb3740a2b7b961f23364f8 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Sun, 27 Aug 2023 10:09:09 +0900 Subject: [PATCH 4/6] Feat(doc): Update eval_steps doc (#487) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a81ac8b50..94427fcd0 100644 --- a/README.md +++ b/README.md @@ -521,7 +521,7 @@ lr_quadratic_warmup: logging_steps: save_strategy: # set to `no` to skip checkpoint saves save_steps: # leave empty to save at each epoch -eval_steps: +eval_steps: # leave empty to eval at each epoch save_total_limit: # checkpoints saved at a time max_steps: From 3fc900629881e369a41ab656d4811c3a0410ea89 Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Sun, 27 Aug 2023 03:10:33 +0200 Subject: [PATCH 5/6] Feat(deepspeed): Add zero2 config (#476) * zero2 config * config added * linting --------- Co-authored-by: mhenrichsen --- deepspeed/zero2.json | 46 
++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 deepspeed/zero2.json diff --git a/deepspeed/zero2.json b/deepspeed/zero2.json new file mode 100644 index 000000000..9b8a2a9b6 --- /dev/null +++ b/deepspeed/zero2.json @@ -0,0 +1,46 @@ +{ + "zero_optimization": { + "stage": 2, + "offload_optimizer": { + "device": "cpu" + }, + "contiguous_gradients": true, + "overlap_comm": true + }, + "bf16": { + "enabled": "auto" + }, + "fp16": { + "enabled": "auto", + "auto_cast": false, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": [ + 0.9, + 0.999 + ], + "eps": 1e-8, + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto", + "total_num_steps": "auto" + } + }, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} From 35130711d634966de5aee6a94b3613551c521db6 Mon Sep 17 00:00:00 2001 From: mhenrichsen Date: Sun, 27 Aug 2023 03:20:17 +0200 Subject: [PATCH 6/6] Feat(cfg): Add code-llama configs for all sizes (#479) * configs for all sizes * update tokenizer type --------- Co-authored-by: mhenrichsen --- examples/code-llama/13b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/13b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/34b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/34b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/7b/lora.yml | 67 ++++++++++++++++++++++++++++++ examples/code-llama/7b/qlora.yml | 69 +++++++++++++++++++++++++++++++ examples/code-llama/README.md | 22 ++++++++++ 7 files changed, 430 insertions(+) create mode 100644 examples/code-llama/13b/lora.yml create mode 100644 examples/code-llama/13b/qlora.yml create mode 100644 examples/code-llama/34b/lora.yml create mode 100644 examples/code-llama/34b/qlora.yml create mode 100644 examples/code-llama/7b/lora.yml create mode 100644 examples/code-llama/7b/qlora.yml create mode 100644 examples/code-llama/README.md diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml new file mode 100644 index 000000000..637c05143 --- /dev/null +++ b/examples/code-llama/13b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-13b-hf +base_model_config: codellama/CodeLlama-13b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 
0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml new file mode 100644 index 000000000..ae78f5bf2 --- /dev/null +++ b/examples/code-llama/13b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-13b-hf +base_model_config: codellama/CodeLlama-13b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml new file mode 100644 index 000000000..9c4cfee10 --- /dev/null +++ b/examples/code-llama/34b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-34b-hf +base_model_config: codellama/CodeLlama-34b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml new file mode 100644 index 000000000..9f5ce50f9 --- /dev/null +++ b/examples/code-llama/34b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-34b-hf +base_model_config: codellama/CodeLlama-34b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora 
+lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml new file mode 100644 index 000000000..dfa3f2f7a --- /dev/null +++ b/examples/code-llama/7b/lora.yml @@ -0,0 +1,67 @@ +base_model: codellama/CodeLlama-7b-hf +base_model_config: codellama/CodeLlama-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./lora-out + +sequence_len: 100000 +sample_packing: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: + bos_token: "" + eos_token: "" + unk_token: "" diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml new file mode 100644 index 000000000..704f058c3 --- /dev/null +++ b/examples/code-llama/7b/qlora.yml @@ -0,0 +1,69 @@ +base_model: codellama/CodeLlama-7b-hf +base_model_config: codellama/CodeLlama-7b-hf +model_type: LlamaForCausalLM +tokenizer_type: CodeLlamaTokenizer +is_llama_derived_model: true + +load_in_8bit: false +load_in_4bit: true +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./qlora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 100000 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_run_id: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 3 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: true +fp16: false +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true + +warmup_steps: 10 +eval_steps: 20 +save_steps: +debug: 
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
diff --git a/examples/code-llama/README.md b/examples/code-llama/README.md
new file mode 100644
index 000000000..a5011e347
--- /dev/null
+++ b/examples/code-llama/README.md
@@ -0,0 +1,22 @@
+# Overview
+
+This is an example of a Code Llama configuration for the 7b, 13b and 34b model sizes.
+
+The 7b variant fits on any GPU with 24 GB of VRAM and takes about 17 GB of VRAM during training with qlora, or 20 GB with lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
+
+The 13b variant will fit in 24 GB of VRAM if you change these settings to the following values:
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+
+The 34b variant does not fit in 24 GB of VRAM - you will need a GPU with 40+ GB of VRAM that also supports flash attention v2 - an A6000 or an A100 is a good choice.
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
+
+```
+or
+
+```shell
+accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
+
+```
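To make the 13b note in the README above concrete, here is a minimal sketch of the two overrides it suggests, assuming you start from the stock `examples/code-llama/13b/qlora.yml` and leave every other key at its shipped value:

```yaml
# Sketch only: the two settings the README suggests lowering so the 13b
# variant fits on a 24 GB GPU; all other keys keep the values shipped in
# examples/code-llama/13b/qlora.yml.
gradient_accumulation_steps: 2   # stock config uses 4
micro_batch_size: 1              # stock config uses 2
```

You would then launch with the same command pattern shown in the README, e.g. `accelerate launch scripts/finetune.py examples/code-llama/13b/qlora.yml`.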