From 674c57692d8f9712314c5238681d1b532af57142 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 19 Sep 2023 09:15:10 -0400
Subject: [PATCH] more sane defaults for openllama 3b used for quickstarts (#602)

* more sane defaults for openllama 3b used for quickstarts

* don't use bf16 for quickstart to simplify gpu compatibility

* use the update openlm-research/open_llama_3b_v2 models
---
 examples/openllama-3b/config.yml | 20 ++++++++++----------
 examples/openllama-3b/lora.yml   | 24 ++++++++++++------------
 examples/openllama-3b/qlora.yml  | 30 +++++++++++++++---------------
 3 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml
index 0d8144d6b..961aeabda 100644
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 256
-max_packed_sequence_len:
+sequence_len: 1024
+sample_packing: true
 lora_r:
 lora_alpha:
 lora_dropout:
@@ -29,11 +29,11 @@ wandb_log_model:
 output_dir: ./openllama-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 3
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.00001
+learning_rate: 0.000003
 train_on_inputs: false
 group_by_length: false
 float16: true
@@ -45,12 +45,12 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 50
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml
index acf0826c9..17fa7fa8b 100644
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter: lora
 lora_model_dir:
-sequence_len: 256
-max_packed_sequence_len:
+sequence_len: 1024
+sample_packing: true
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.0
@@ -33,9 +33,9 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-out
-batch_size: 16
-micro_batch_size: 4
-num_epochs: 3
+gradient_accumulation_steps: 1
+micro_batch_size: 2
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
@@ -50,16 +50,16 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 50
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
-weight_decay: 0.0
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml
index d8c43df82..deba03fd5 100644
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
+sequence_len: 1024
+sample_packing: true
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
@@ -27,33 +27,33 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
-batch_size: 4
-micro_batch_size: 4
-num_epochs: 2
+gradient_accumulation_steps: 1
+micro_batch_size: 2
+num_epochs: 4
 optimizer: paged_adamw_32bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
-tf32: true
+bf16: false
+fp16: true
+tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 20
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
-weight_decay: 0.0
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
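
Note on the precision defaults: per the commit message, the quickstart avoids bf16 so the qlora example also runs on GPUs without bfloat16 support (the diff sets fp16: true and turns bf16/tf32 off). A minimal sketch of the overrides one might put back in examples/openllama-3b/qlora.yml on an Ampere-or-newer GPU; these values are an illustrative assumption, not part of this patch:

# assumed overrides for a bfloat16-capable GPU (e.g. A100, RTX 30xx); not part of this patch
bf16: true
fp16: false
tf32: true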