diff --git a/README.md b/README.md
index fd5a60947..67592454a 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ wandb_log_model: # 'checkpoint'
 output_dir: ./completed-model
 
 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size: 2
 num_epochs: 3
diff --git a/configs/cerebras_1_3B_alpaca.yml b/configs/cerebras_1_3B_alpaca.yml
index 2c5534552..958bf4c5a 100644
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
diff --git a/configs/galactica_1_3B.yml b/configs/galactica_1_3B.yml
index cc1aa7ced..2abb4c6b4 100644
--- a/configs/galactica_1_3B.yml
+++ b/configs/galactica_1_3B.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 32
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
diff --git a/configs/gpt_neox_20b.yml b/configs/gpt_neox_20b.yml
index 260b648b5..730afb72c 100644
--- a/configs/gpt_neox_20b.yml
+++ b/configs/gpt_neox_20b.yml
@@ -25,7 +25,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
-batch_size: 48
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00003
diff --git a/configs/llama_13B_alpaca.yml b/configs/llama_13B_alpaca.yml
index 3386f65c7..99c9883fe 100644
--- a/configs/llama_13B_alpaca.yml
+++ b/configs/llama_13B_alpaca.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
-batch_size: 64
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
index 1c865626d..e7d2c211c 100644
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -29,7 +29,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
diff --git a/configs/llama_7B_4bit.yml b/configs/llama_7B_4bit.yml
index feb4e21a1..a7451516c 100644
--- a/configs/llama_7B_4bit.yml
+++ b/configs/llama_7B_4bit.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/llama_7B_alpaca.yml b/configs/llama_7B_alpaca.yml
index 66af807dd..7db2f65aa 100644
--- a/configs/llama_7B_alpaca.yml
+++ b/configs/llama_7B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-batch_size: 128
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
diff --git a/configs/llama_7B_jeopardy.yml b/configs/llama_7B_jeopardy.yml
index 4a20ddec6..287d6d6ab 100644
--- a/configs/llama_7B_jeopardy.yml
+++ b/configs/llama_7B_jeopardy.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-batch_size: 4
+gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 2
 optimizer: adamw_bnb_8bit
diff --git a/configs/pythia_1_2B_alpaca.yml b/configs/pythia_1_2B_alpaca.yml
index aa0587b90..52ed58cb5 100644
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-batch_size: 48
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00001
diff --git a/configs/quickstart.yml b/configs/quickstart.yml
index a671c6e7f..2362916fc 100644
--- a/configs/quickstart.yml
+++ b/configs/quickstart.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-batch_size: 4
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/sample.yml b/configs/sample.yml
index d8b7afa71..ddd95cb55 100644
--- a/configs/sample.yml
+++ b/configs/sample.yml
@@ -53,7 +53,8 @@ wandb_log_model:
 # where to save the finsihed model to
 output_dir: ./completed-model
 # training hyperparameters
-batch_size: 8
+gradient_accumulation_steps: 1
+batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
diff --git a/configs/stability_3b.yml b/configs/stability_3b.yml
index ecbd37980..83516a20a 100644
--- a/configs/stability_3b.yml
+++ b/configs/stability_3b.yml
@@ -22,7 +22,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
-batch_size: 2
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
diff --git a/configs/vicuna_13B_4bit_reflect.yml b/configs/vicuna_13B_4bit_reflect.yml
index 7ad409f26..3e37f5334 100644
--- a/configs/vicuna_13B_4bit_reflect.yml
+++ b/configs/vicuna_13B_4bit_reflect.yml
@@ -30,7 +30,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
-batch_size: 8
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
diff --git a/examples/gptq-lora-7b/config.yml b/examples/gptq-lora-7b/config.yml
index e676d967b..351a2bf97 100644
--- a/examples/gptq-lora-7b/config.yml
+++ b/examples/gptq-lora-7b/config.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-7b-lora-int4
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
diff --git a/examples/mpt-7b/config.yml b/examples/mpt-7b/config.yml
index ca6a0d5c3..f21ce7022 100644
--- a/examples/mpt-7b/config.yml
+++ b/examples/mpt-7b/config.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
-batch_size: 1
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
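
Note (editorial, not part of the patch): every hunk above replaces the derived batch_size setting with an explicit gradient_accumulation_steps, and sample.yml keeps batch_size only as an empty, deprecated key. Assuming the usual convention that the effective batch per optimizer step is micro_batch_size x gradient_accumulation_steps x (number of data-parallel devices), setting accumulation to 1 does not reproduce the old totals on a single GPU; a config equivalent to the README's former batch_size: 8 with micro_batch_size: 2 on one GPU would instead be, as a sketch:

    # illustrative sketch only; not a hunk from this patch
    micro_batch_size: 2             # samples per device per forward pass
    gradient_accumulation_steps: 4  # 2 x 4 = effective batch of 8 on one GPU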