Merge pull request #350 from tmm1/group-len-false-examples

set `group_by_length` to false in all examples
2023-08-09 14:48:48 -07:00
parent 176b888a63 b4d1d22782
commit f5c11f8262
6 changed files with 8 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -426,7 +426,9 @@ save_safetensors:

 # whether to mask out or include the human's prompt from the training labels
 train_on_inputs: false
-# don't use this, leads to wonky training (according to someone on the internet)
+# group similarly sized data to minimize padding
+# may be slower to start, as it must download and sort the entire dataset
+# note that training loss may have an oscillating pattern with this enabled
 group_by_length: false

 # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -35,7 +35,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -32,7 +32,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -38,7 +38,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002

 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -39,7 +39,7 @@ lr_scheduler: cosine
 learning_rate: 0.0002

 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: false
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -34,7 +34,7 @@ torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
-group_by_length: true
+group_by_length: false
 bf16: true
 fp16: false
 tf32: true