more sane defaults for openllama 3b used for quickstarts (#602)

* more sane defaults for openllama 3b used for quickstarts

* don't use bf16 for the quickstart, to simplify GPU compatibility

* use the updated openlm-research/open_llama_3b_v2 models
This commit is contained in:
Wing Lian
2023-09-19 09:15:10 -04:00
committed by GitHub
parent 1eebbd09c3
commit 674c57692d
3 changed files with 37 additions and 37 deletions

View File

@@ -1,5 +1,5 @@
base_model: openlm-research/open_llama_3b base_model: openlm-research/open_llama_3b_v2
base_model_config: openlm-research/open_llama_3b base_model_config: openlm-research/open_llama_3b_v2
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
load_in_8bit: false load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
val_set_size: 0.02 val_set_size: 0.02
adapter: adapter:
lora_model_dir: lora_model_dir:
sequence_len: 256 sequence_len: 1024
max_packed_sequence_len: sample_packing: true
lora_r: lora_r:
lora_alpha: lora_alpha:
lora_dropout: lora_dropout:
@@ -29,11 +29,11 @@ wandb_log_model:
output_dir: ./openllama-out output_dir: ./openllama-out
gradient_accumulation_steps: 1 gradient_accumulation_steps: 1
micro_batch_size: 1 micro_batch_size: 1
num_epochs: 3 num_epochs: 4
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.00001 learning_rate: 0.000003
train_on_inputs: false train_on_inputs: false
group_by_length: false group_by_length: false
float16: true float16: true
@@ -45,12 +45,12 @@ early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank: local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention:
flash_attention: flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 20
eval_steps: 50 eval_steps: 0.05
save_steps: save_steps:
debug: debug:
deepspeed: deepspeed:

View File

@@ -1,5 +1,5 @@
base_model: openlm-research/open_llama_3b base_model: openlm-research/open_llama_3b_v2
base_model_config: openlm-research/open_llama_3b base_model_config: openlm-research/open_llama_3b_v2
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
load_in_8bit: true load_in_8bit: true
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
val_set_size: 0.02 val_set_size: 0.02
adapter: lora adapter: lora
lora_model_dir: lora_model_dir:
sequence_len: 256 sequence_len: 1024
max_packed_sequence_len: sample_packing: true
lora_r: 8 lora_r: 8
lora_alpha: 16 lora_alpha: 16
lora_dropout: 0.0 lora_dropout: 0.0
@@ -33,9 +33,9 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./lora-out output_dir: ./lora-out
batch_size: 16 gradient_accumulation_steps: 1
micro_batch_size: 4 micro_batch_size: 2
num_epochs: 3 num_epochs: 4
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
@@ -50,16 +50,16 @@ early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank: local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention:
flash_attention: flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 20
eval_steps: 50 eval_steps: 0.05
save_steps: save_steps:
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.1
fsdp: fsdp:
fsdp_config: fsdp_config:
special_tokens: special_tokens:

View File

@@ -1,5 +1,5 @@
base_model: openlm-research/open_llama_3b base_model: openlm-research/open_llama_3b_v2
base_model_config: openlm-research/open_llama_3b base_model_config: openlm-research/open_llama_3b_v2
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
load_in_8bit: false load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
val_set_size: 0.01 val_set_size: 0.01
adapter: qlora adapter: qlora
lora_model_dir: lora_model_dir:
sequence_len: 2048 sequence_len: 1024
max_packed_sequence_len: 2048 sample_packing: true
lora_r: 8 lora_r: 8
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.05 lora_dropout: 0.05
@@ -27,33 +27,33 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
output_dir: ./qlora-out output_dir: ./qlora-out
batch_size: 4 gradient_accumulation_steps: 1
micro_batch_size: 4 micro_batch_size: 2
num_epochs: 2 num_epochs: 4
optimizer: paged_adamw_32bit optimizer: paged_adamw_32bit
torchdistx_path: torchdistx_path:
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
train_on_inputs: false train_on_inputs: false
group_by_length: false group_by_length: false
bf16: true bf16: false
fp16: false fp16: true
tf32: true tf32: false
gradient_checkpointing: true gradient_checkpointing: true
early_stopping_patience: early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank: local_rank:
logging_steps: 1 logging_steps: 1
xformers_attention: true xformers_attention:
flash_attention: flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 20
eval_steps: 20 eval_steps: 0.05
save_steps: save_steps:
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.1
fsdp: fsdp:
fsdp_config: fsdp_config:
special_tokens: special_tokens: