From f30d062b48ddc06f01f3cb7434dc8ceca30b2ef4 Mon Sep 17 00:00:00 2001 From: Nathan Cooper Date: Mon, 26 Feb 2024 18:44:25 -0500 Subject: [PATCH] Add StableLM 2 Example Scripts (#1327) [skip ci] * Add StableLM examples and configurations * Add FFT and LORA configuration files and modify readme with usage --- examples/stablelm-2/1.6b/fft.yml | 69 +++++++++++++++++++++++++++++++ examples/stablelm-2/1.6b/lora.yml | 66 +++++++++++++++++++++++++++++ examples/stablelm-2/README.md | 36 ++++++++++++++++ 3 files changed, 171 insertions(+) create mode 100644 examples/stablelm-2/1.6b/fft.yml create mode 100644 examples/stablelm-2/1.6b/lora.yml create mode 100644 examples/stablelm-2/README.md diff --git a/examples/stablelm-2/1.6b/fft.yml b/examples/stablelm-2/1.6b/fft.yml new file mode 100644 index 000000000..f3fc16f86 --- /dev/null +++ b/examples/stablelm-2/1.6b/fft.yml @@ -0,0 +1,69 @@ +base_model: stabilityai/stablelm-2-1_6b +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +trust_remote_code: true + +load_in_8bit: false +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: last_run_prepared +val_set_size: 0.05 +output_dir: ./out + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +adapter: +lora_model_dir: +lora_r: +lora_alpha: +lora_dropout: +lora_target_linear: +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +flash_attn_cross_entropy: false +flash_attn_rms_norm: true +flash_attn_fuse_qkv: false +flash_attn_fuse_mlp: true + +warmup_steps: 
100 +evals_per_epoch: 4 +eval_table_size: +saves_per_epoch: 1 +debug: +deepspeed: #deepspeed_configs/zero2.json # multi-gpu only +weight_decay: 0.1 +fsdp: +fsdp_config: +special_tokens: diff --git a/examples/stablelm-2/1.6b/lora.yml b/examples/stablelm-2/1.6b/lora.yml new file mode 100644 index 000000000..c5051fab6 --- /dev/null +++ b/examples/stablelm-2/1.6b/lora.yml @@ -0,0 +1,66 @@ +base_model: stabilityai/stablelm-2-1_6b +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +trust_remote_code: true + +load_in_8bit: true +load_in_4bit: false +strict: false + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca +dataset_prepared_path: +val_set_size: 0.05 +output_dir: ./lora-out + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +adapter: lora +lora_model_dir: +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_fan_in_fan_out: + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +train_on_inputs: false +group_by_length: false +bf16: auto +fp16: +tf32: false + +gradient_checkpointing: true +early_stopping_patience: +resume_from_checkpoint: +local_rank: +logging_steps: 1 +xformers_attention: +flash_attention: true +flash_attn_cross_entropy: false +flash_attn_rms_norm: true + +warmup_steps: 10 +evals_per_epoch: 4 +saves_per_epoch: 1 +debug: +deepspeed: +weight_decay: 0.0 +fsdp: +fsdp_config: +special_tokens: diff --git a/examples/stablelm-2/README.md b/examples/stablelm-2/README.md new file mode 100644 index 000000000..0cdc2ffb0 --- /dev/null +++ b/examples/stablelm-2/README.md @@ -0,0 +1,36 @@ +# StableLM 2 + +This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case. 
+ +## Estimating GPU Requirements + +| type | deepspeed | batch size | context length | vRAM GPU (GBs) | +|---------------|-----------|------------|----------------|----------------| +| full finetune | N/A | 1 | 4096 | ~21.5GBs | +| full finetune | zero2 | 1 | 4096 | ~20GBs | +| lora | N/A | 1 | 4096 | ~16.6GBs | + +The above are estimates and might differ slightly depending on the setup, for example whether you pack your sequences or not (the above assumes you pack them to length 4096). + +This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html + +## Training +We have example scripts here for both full finetuning and lora using the popular alpaca dataset: + +```shell +# preprocess the dataset +CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml +``` + +Single GPU Training: +```shell +python -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json +# OR +python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml +``` + +Multinode GPU Training with `accelerate`: +```shell +# make sure you've configured accelerate properly +accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json +```