Add StableLM 2 Example Scripts (#1327) [skip ci]

* Add StableLM examples and configurations * Add FFT and LORA configuration files and modify readme with usage
2024-02-26 18:44:25 -05:00
parent 269c5436ea
commit f30d062b48
3 changed files with 171 additions and 0 deletions
--- a/examples/stablelm-2/1.6b/fft.yml
+++ b/examples/stablelm-2/1.6b/fft.yml
@@ -0,0 +1,69 @@
 # Full fine-tune example for stabilityai/stablelm-2-1_6b (no adapter configured).
 # Keys left as `null` are intentionally unset.
 # --- model / tokenizer ---
 base_model: stabilityai/stablelm-2-1_6b
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 trust_remote_code: true
 # --- quantized loading (both off for a full fine-tune) ---
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 # --- data ---
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./out
 # --- sequence handling ---
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 # --- adapter / LoRA (all unset: this config trains the full model) ---
 adapter: null
 lora_model_dir: null
 lora_r: null
 lora_alpha: null
 lora_dropout: null
 lora_target_linear: null
 lora_fan_in_fan_out: null
 # --- Weights & Biases logging (unset) ---
 wandb_project: null
 wandb_entity: null
 wandb_watch: null
 wandb_name: null
 wandb_log_model: null
 # --- batch size, optimizer and schedule ---
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 # --- numeric precision ---
 bf16: auto
 fp16: null
 tf32: false
 # --- memory / checkpointing ---
 gradient_checkpointing: true
 early_stopping_patience: null
 resume_from_checkpoint: null
 local_rank: null
 logging_steps: 1
 # --- attention implementation ---
 # NOTE(review): confirm the flash_attn_* fused-kernel flags take effect for
 # StableLM 2; this file does not show which architectures honour them.
 xformers_attention: null
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 # --- warmup, evaluation and saving cadence ---
 warmup_steps: 100
 evals_per_epoch: 4
 eval_table_size: null
 saves_per_epoch: 1
 debug: null
 # multi-gpu only: set to deepspeed_configs/zero2.json
 deepspeed: null
 weight_decay: 0.1
 fsdp: null
 fsdp_config: null
 special_tokens: null
--- a/examples/stablelm-2/1.6b/lora.yml
+++ b/examples/stablelm-2/1.6b/lora.yml
@@ -0,0 +1,66 @@
 # LoRA example for stabilityai/stablelm-2-1_6b with the base model loaded in 8-bit.
 # Keys left as `null` are intentionally unset.
 # --- model / tokenizer ---
 base_model: stabilityai/stablelm-2-1_6b
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 trust_remote_code: true
 # --- quantized loading ---
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 # --- data ---
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 # Set explicitly (same value as the sibling fft.yml example) because the README
 # runs the preprocess step against this file.
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./lora-out
 # --- sequence handling ---
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 # --- adapter / LoRA ---
 adapter: lora
 lora_model_dir: null
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out: null
 # --- Weights & Biases logging (unset) ---
 wandb_project: null
 wandb_entity: null
 wandb_watch: null
 wandb_name: null
 wandb_log_model: null
 # --- batch size, optimizer and schedule ---
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 # --- numeric precision ---
 bf16: auto
 fp16: null
 tf32: false
 # --- memory / checkpointing ---
 gradient_checkpointing: true
 early_stopping_patience: null
 resume_from_checkpoint: null
 local_rank: null
 logging_steps: 1
 # --- attention implementation ---
 # NOTE(review): confirm the flash_attn_* fused-kernel flags take effect for
 # StableLM 2; this file does not show which architectures honour them.
 xformers_attention: null
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 # --- warmup, evaluation and saving cadence ---
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 debug: null
 deepspeed: null
 weight_decay: 0.0
 fsdp: null
 fsdp_config: null
 special_tokens: null
--- a/examples/stablelm-2/README.md
+++ b/examples/stablelm-2/README.md
@@ -0,0 +1,36 @@
 # StableLM 2
 This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case.
 ## Estimating GPU Requirements
 | type          | deepspeed | batch size | context length | vRAM GPU (GBs) |
 |---------------|-----------|------------|----------------|----------------|
 | full finetune | N/A       | 1          | 4096           | ~21.5GBs       |
 | full finetune | zero2     | 1          | 4096           | ~20GBs         |
 | lora          | N/A       | 1          | 4096           | ~16.6GBs       |
 The above are estimates and might differ slightly depending on the setup, for example whether or not you pack your sequences (the above assumes you pack to length 4096).
 This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html
 ## Training
 We have example scripts here for both full finetuning and lora using the popular alpaca dataset:
 ```shell
 # preprocess the dataset
 CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml
 ```
 Single GPU Training:
 ```shell
 python -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json
 # OR
 python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml
 ```
 Multinode GPU Training with `accelerate`:
 ```shell
 # make sure you've configured accelerate properly
 accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json
 ```