From c530e4b9c877815b8f23a730014a10b81e206cdf Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 11 Jun 2023 10:09:05 -0400
Subject: [PATCH] more config pruning and migrating

---
 configs/llama_7B_alpaca.yml    | 41 ---------
 configs/sample.yml             | 87 -------------------
 examples/gptj-qlora/config.yml | 57 ++++++++++++
 .../jeopardy-bot/config.yml    | 25 +++---
 4 files changed, 68 insertions(+), 142 deletions(-)
 delete mode 100644 configs/llama_7B_alpaca.yml
 delete mode 100644 configs/sample.yml
 create mode 100644 examples/gptj-qlora/config.yml
 rename configs/llama_7B_jeopardy.yml => examples/jeopardy-bot/config.yml (75%)

diff --git a/configs/llama_7B_alpaca.yml b/configs/llama_7B_alpaca.yml
deleted file mode 100644
index 7db2f65aa..000000000
--- a/configs/llama_7B_alpaca.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-base_model: huggyllama/llama-7b
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-load_in_8bit: true
-datasets:
-  - path: data/alpaca_data_gpt4.jsonl
-    type: alpaca
-  - path: data/vicuna_cleaned.jsonl
-    type: sharegpt
-  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
-    type: gpteacher
-  - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
-    type: gpteacher
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.04
-adapter: lora
-lora_model_dir:
-sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-lora_fan_in_fan_out: false
-wandb_project: llama-7b-lora
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-output_dir: ./lora-llama-alpaca
-gradient_accumulation_steps: 1
-micro_batch_size: 16
-num_epochs: 5
-learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
-bf16: true
-tf32: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
diff --git a/configs/sample.yml b/configs/sample.yml
deleted file mode 100644
index ddd95cb55..000000000
--- a/configs/sample.yml
+++ /dev/null
@@ -1,87 +0,0 @@
-# this is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# this can also be a relative path to a model on disk
-base_model: decapoda-research/llama-7b-hf-int4
-# you can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-base_model_ignore_patterns:
-# if the base_model repo on hf hub doesn't include configuration .json files,
-# you can set that here, or leave this empty to default to base_model
-base_model_config: decapoda-research/llama-7b-hf
-# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-model_type: AutoModelForCausalLM
-# Corresponding tokenizer for the model AutoTokenizer is a good choice
-tokenizer_type: AutoTokenizer
-# whether you are training a 4-bit quantized model
-load_4bit: true
-# this will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-load_in_8bit: true
-# a list of one or more datasets to finetune the model with
-datasets:
-  # this can be either a hf dataset, or relative path
-  - path: vicgalle/alpaca-gpt4
-    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca
-# axolotl attempts to save the dataset as an arrow after packing the data together so
-# subsequent training attempts load faster, relative path
-dataset_prepared_path: data/last_run_prepared
-# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc
-val_set_size: 0.04
-# if you want to use lora, leave blank to train all parameters in original model
-adapter: lora
-# if you already have a lora model trained that you want to load, put that here
-lora_model_dir:
-# the maximum length of an input to train with, this should typically be less than 2048
-# as most models have a token/context limit of 2048
-sequence_len: 2048
-# max sequence length to concatenate training samples together up to
-# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-max_packed_sequence_len: 1024
-# lora hyperparameters
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules:
-  - q_proj
-  - v_proj
-#  - k_proj
-#  - o_proj
-lora_fan_in_fan_out: false
-# wandb configuration if your're using it
-wandb_project:
-wandb_watch:
-wandb_run_id:
-wandb_log_model:
-# where to save the finsihed model to
-output_dir: ./completed-model
-# training hyperparameters
-gradient_accumulation_steps: 1
-batch_size:
-micro_batch_size: 2
-num_epochs: 3
-warmup_steps: 100
-learning_rate: 0.00003
-# whether to mask out or include the human's prompt from the training labels
-train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
-group_by_length: false
-# Use CUDA bf16
-bf16: true
-# Use CUDA tf32
-tf32: true
-# does not work with current implementation of 4-bit LoRA
-gradient_checkpointing: false
-# stop training after this many evaluation losses have increased in a row
-# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-early_stopping_patience: 3
-# specify a scheduler to use with the optimizer. only one_cycle is supported currently
-lr_scheduler:
-# whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-xformers_attention:
-# whether to use flash attention patch https://github.com/HazyResearch/flash-attention:
-flash_attention:
-# resume from a specific checkpoint dir
-resume_from_checkpoint:
-# if resume_from_checkpoint isn't set and you simply want it to start where it left off
-# be careful with this being turned on between different models
-auto_resume_from_checkpoints: false
-# don't mess with this, it's here for accelerate and torchrun
-local_rank:
diff --git a/examples/gptj-qlora/config.yml b/examples/gptj-qlora/config.yml
new file mode 100644
index 000000000..858c14862
--- /dev/null
+++ b/examples/gptj-qlora/config.yml
@@ -0,0 +1,57 @@
+base_model: EleutherAI/gpt-j-6b
+base_model_config: EleutherAI/gpt-j-6b
+load_in_8bit: false
+load_in_4bit: true
+strict: false
+push_dataset_to_hub:
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+adapter: qlora
+lora_model_dir:
+sequence_len: 2048
+max_packed_sequence_len:
+lora_r: 8
+lora_alpha: 32
+lora_dropout: 0.05
+lora_target_modules:
+lora_target_linear: true
+lora_fan_in_fan_out:
+wandb_project:
+wandb_watch:
+wandb_run_id:
+wandb_log_model:
+output_dir: ./qlora-out
+gradient_accumulation_steps: 2
+micro_batch_size: 2
+num_epochs: 2
+optimizer: paged_adamw_8bit
+torchdistx_path:
+lr_scheduler: cosine
+learning_rate: 0.0001
+train_on_inputs: false
+group_by_length: true
+bf16: true
+fp16: false
+tf32: true
+gradient_checkpointing: true
+early_stopping_patience:
+resume_from_checkpoint:
+local_rank:
+logging_steps: 1
+xformers_attention: true
+flash_attention:
+gptq_groupsize:
+gptq_model_v1:
+warmup_steps: 10
+eval_steps: 20
+save_steps:
+debug:
+deepspeed:
+weight_decay: 0.1
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: "<|endoftext|>"
diff --git a/configs/llama_7B_jeopardy.yml b/examples/jeopardy-bot/config.yml
similarity index 75%
rename from configs/llama_7B_jeopardy.yml
rename to examples/jeopardy-bot/config.yml
index 287d6d6ab..b803c6074 100644
--- a/configs/llama_7B_jeopardy.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -7,30 +7,28 @@ datasets:
   - path: openaccess-ai-collective/jeopardy
     type: jeopardy
 dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
+val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.05
+sequence_len: 512
+max_packed_sequence_len:
+lora_r:
+lora_alpha:
+lora_dropout:
 lora_target_modules:
-  - q_proj
-  - v_proj
 lora_fan_in_fan_out: false
-wandb_project: jeopardy-bot-7b
+wandb_project:
 wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-gradient_accumulation_steps: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 2
+num_epochs: 3
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.0000002
+learning_rate: 0.00003
 train_on_inputs: false
 group_by_length: false
 bf16: true
@@ -48,11 +46,10 @@ eval_steps: 110
 save_steps: 660
 debug:
 deepspeed:
-weight_decay: 0.0001
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 tokens:
-  pad_token: "[PAD]"
   bos_token: "<s>"
   eos_token: "</s>"
   unk_token: "<unk>"
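For reference, a minimal sketch (not part of the patch itself) of reading the new examples/gptj-qlora/config.yml and printing the QLoRA-related settings it introduces; it assumes PyYAML is installed and the file layout created by this commit:

import yaml  # PyYAML

# Hypothetical sanity check, not part of this patch: load the migrated example
# config and surface the key QLoRA hyperparameters before launching a run.
with open("examples/gptj-qlora/config.yml") as f:
    cfg = yaml.safe_load(f)

for key in ("base_model", "adapter", "load_in_4bit", "lora_r", "lora_alpha",
            "lora_dropout", "optimizer", "learning_rate", "num_epochs"):
    print(f"{key}: {cfg.get(key)}")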