diff --git a/README.md b/README.md
index fd5a60947..67592454a 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ wandb_log_model: # 'checkpoint'
 output_dir: ./completed-model
 
 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size: 2
 num_epochs: 3
diff --git a/configs/cerebras_1_3B_alpaca.yml b/configs/cerebras_1_3B_alpaca.yml
index 2c5534552..958bf4c5a 100644
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
diff --git a/configs/galactica_1_3B.yml b/configs/galactica_1_3B.yml
index cc1aa7ced..2abb4c6b4 100644
--- a/configs/galactica_1_3B.yml
+++ b/configs/galactica_1_3B.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
diff --git a/configs/gpt_neox_20b.yml b/configs/gpt_neox_20b.yml
index 260b648b5..730afb72c 100644
--- a/configs/gpt_neox_20b.yml
+++ b/configs/gpt_neox_20b.yml
@@ -25,7 +25,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
-batch_size: 48
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00003
diff --git a/configs/llama_13B_alpaca.yml b/configs/llama_13B_alpaca.yml
index 3386f65c7..99c9883fe 100644
--- a/configs/llama_13B_alpaca.yml
+++ b/configs/llama_13B_alpaca.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
-batch_size: 64
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
index 1c865626d..e7d2c211c 100644
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -29,7 +29,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
diff --git a/configs/llama_7B_4bit.yml b/configs/llama_7B_4bit.yml
index feb4e21a1..a7451516c 100644
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/llama_7B_alpaca.yml b/configs/llama_7B_alpaca.yml
index 66af807dd..7db2f65aa 100644
--- a/configs/llama_7B_alpaca.yml
+++ b/configs/llama_7B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
diff --git a/configs/llama_7B_jeopardy.yml b/configs/llama_7B_jeopardy.yml
index 4a20ddec6..287d6d6ab 100644
--- a/configs/llama_7B_jeopardy.yml
+++ b/configs/llama_7B_jeopardy.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-batch_size: 4
+gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 2
 optimizer: adamw_bnb_8bit
diff --git a/configs/pythia_1_2B_alpaca.yml b/configs/pythia_1_2B_alpaca.yml
index aa0587b90..52ed58cb5 100644
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 48
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00001
diff --git a/configs/quickstart.yml b/configs/quickstart.yml
index a671c6e7f..2362916fc 100644
--- a/configs/quickstart.yml
+++ b/configs/quickstart.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 4
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/sample.yml b/configs/sample.yml
index d8b7afa71..ddd95cb55 100644
--- a/configs/sample.yml
+++ b/configs/sample.yml
@@ -53,7 +53,8 @@ wandb_log_model:
 # where to save the finsihed model to
 output_dir: ./completed-model
 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
+batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/stability_3b.yml b/configs/stability_3b.yml
index ecbd37980..83516a20a 100644
--- a/configs/stability_3b.yml
+++ b/configs/stability_3b.yml
@@ -22,7 +22,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
-batch_size: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
diff --git a/configs/vicuna_13B_4bit_reflect.yml b/configs/vicuna_13B_4bit_reflect.yml
index 7ad409f26..3e37f5334 100644
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ b/configs/vicuna_13B_4bit_reflect.yml
@@ -30,7 +30,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml
index e676d967b..351a2bf97 100644
--- a/examples/gptq-lora-7b/config.yml
+++ b/examples/gptq-lora-7b/config.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-7b-lora-int4
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
diff --git a/examples/mpt-7b/config.yml b/examples/mpt-7b/config.yml
index ca6a0d5c3..f21ce7022 100644
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
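
Note (editorial, not part of the patch): every hunk above replaces the derived batch_size setting with an explicit gradient_accumulation_steps, and sample.yml keeps batch_size only as an empty, deprecated key. Assuming the usual convention that the effective batch per optimizer step is micro_batch_size x gradient_accumulation_steps x (number of data-parallel devices), setting accumulation to 1 does not reproduce the old totals on a single GPU; a config equivalent to the README's former batch_size: 8 with micro_batch_size: 2 on one GPU would instead be, as a sketch:

    # illustrative sketch only; not a hunk from this patch
    micro_batch_size: 2             # samples per device per forward pass
    gradient_accumulation_steps: 4  # 2 x 4 = effective batch of 8 on one GPU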