From 8e197f6fb48da66a82e804dca39bc5932c7055e3 Mon Sep 17 00:00:00 2001
From: Birch-san
Date: Mon, 28 Aug 2023 23:47:16 +0100
Subject: [PATCH] pad_to_worst_case_seq_len boolean, for testing memory limits
 (#498)

* pad_to_worst_case_seq_len boolean, for testing memory limits

* remove collator_pad_to_longest option since it does nothing

see docs: https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding.padding
True and "longest" mean the same thing

* rename to `pad_to_sequence_len`, and ensure 64 alignment

---------

Co-authored-by: Aman Karmani
---
 README.md                      | 6 +++---
 examples/pythia-12b/config.yml | 1 -
 src/axolotl/utils/trainer.py   | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 94427fcd0..a18c1108f 100644
--- a/README.md
+++ b/README.md
@@ -459,6 +459,9 @@ dataset_shard_idx:
 # the maximum length of an input to train with, this should typically be less than 2048
 # as most models have a token/context limit of 2048
 sequence_len: 2048
+# pad inputs so each step uses constant sized buffers
+# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+pad_to_sequence_len:
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 # FutureWarning: This will soon be DEPRECATED
@@ -610,9 +613,6 @@ deepspeed:
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:

-# Set padding for data collator to 'longest'
-collator_pad_to_longest:
-
 # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
 pretraining_dataset:

diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml
index 535e5cd37..dc06eb6b6 100644
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -47,4 +47,3 @@ local_rank:
 gradient_checkpointing: true
 fsdp:
 fsdp_config:
-collator_pad_to_longest: true
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 24be1b8c2..1bc190fe2 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -585,10 +585,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
         callbacks.append(SaveBetterTransformerModelCallback)

     data_collator_kwargs = {
-        "padding": True,
+        "padding": True,  # True/"longest" is the default
     }
-    if cfg.collator_pad_to_longest:
-        data_collator_kwargs["padding"] = "longest"
+    if cfg.pad_to_sequence_len:
+        data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64)
     else:
         # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
         # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
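
For context on the trainer change above, here is a minimal standalone sketch of how the collator kwargs resolve once `pad_to_sequence_len` is enabled, assuming `sequence_len: 2048`. The `build_collator_kwargs` helper and the `SimpleNamespace` config stand-in are illustrative only, not axolotl APIs, and the else-branch fallback of padding to a multiple of 64 is an assumption based on the A100 comment visible in the diff.

# Sketch only (not axolotl code): how the pad_to_sequence_len flag shapes the
# kwargs passed to the Hugging Face data collator.
from types import SimpleNamespace


def build_collator_kwargs(cfg):
    data_collator_kwargs = {
        "padding": True,  # True/"longest" is the default
    }
    if cfg.pad_to_sequence_len:
        # Pad every batch to a 64-aligned value derived from sequence_len,
        # so each training step re-uses constant-sized buffers.
        data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64)
    else:
        # Assumed fallback: pad to a multiple of 64 (A100-friendly; other GPUs need only 8).
        data_collator_kwargs["pad_to_multiple_of"] = 64
    return data_collator_kwargs


cfg = SimpleNamespace(sequence_len=2048, pad_to_sequence_len=True)
print(build_collator_kwargs(cfg))
# {'padding': True, 'pad_to_multiple_of': 2048}

With `sequence_len: 2048`, every batch is padded to 2048 tokens, so buffer shapes stay constant across steps; per the README text added in this patch, that trades extra compute on short batches for less memory fragmentation and fewer OOMs.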