From 8e197f6fb48da66a82e804dca39bc5932c7055e3 Mon Sep 17 00:00:00 2001
From: Birch-san
Date: Mon, 28 Aug 2023 23:47:16 +0100
Subject: [PATCH] pad_to_worst_case_seq_len boolean, for testing memory limits
 (#498)

* pad_to_worst_case_seq_len boolean, for testing memory limits

* remove collator_pad_to_longest option since it does nothing

see docs: https://huggingface.co/docs/transformers/main_classes/data_collator#transformers.DataCollatorWithPadding.padding
True and "longest" mean the same thing

* rename to `pad_to_sequence_len`, and ensure 64 alignment

---------

Co-authored-by: Aman Karmani
---
 README.md                      | 6 +++---
 examples/pythia-12b/config.yml | 1 -
 src/axolotl/utils/trainer.py   | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 94427fcd0..a18c1108f 100644
--- a/README.md
+++ b/README.md
@@ -459,6 +459,9 @@ dataset_shard_idx:
 # the maximum length of an input to train with, this should typically be less than 2048
 # as most models have a token/context limit of 2048
 sequence_len: 2048
+# pad inputs so each step uses constant sized buffers
+# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+pad_to_sequence_len:
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 # FutureWarning: This will soon be DEPRECATED
@@ -610,9 +613,6 @@ deepspeed:
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:

-# Set padding for data collator to 'longest'
-collator_pad_to_longest:
-
 # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
 pretraining_dataset:

diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml
index 535e5cd37..dc06eb6b6 100644
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -47,4 +47,3 @@ local_rank:
 gradient_checkpointing: true
 fsdp:
 fsdp_config:
-collator_pad_to_longest: true
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 24be1b8c2..1bc190fe2 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -585,10 +585,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
         callbacks.append(SaveBetterTransformerModelCallback)

     data_collator_kwargs = {
-        "padding": True,
+        "padding": True,  # True/"longest" is the default
     }
-    if cfg.collator_pad_to_longest:
-        data_collator_kwargs["padding"] = "longest"
+    if cfg.pad_to_sequence_len:
+        data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64)
     else:
         # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
         # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
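
For context on the trainer change above, here is a minimal standalone sketch of how the collator kwargs resolve once `pad_to_sequence_len` is enabled, assuming `sequence_len: 2048`. The `build_collator_kwargs` helper and the `SimpleNamespace` config stand-in are illustrative only, not axolotl APIs, and the else-branch fallback of padding to a multiple of 64 is an assumption based on the A100 comment visible in the diff.

# Sketch only (not axolotl code): how the pad_to_sequence_len flag shapes the
# kwargs passed to the Hugging Face data collator.
from types import SimpleNamespace


def build_collator_kwargs(cfg):
    data_collator_kwargs = {
        "padding": True,  # True/"longest" is the default
    }
    if cfg.pad_to_sequence_len:
        # Pad every batch to a 64-aligned value derived from sequence_len,
        # so each training step re-uses constant-sized buffers.
        data_collator_kwargs["pad_to_multiple_of"] = 64 * round(cfg.sequence_len / 64)
    else:
        # Assumed fallback: pad to a multiple of 64 (A100-friendly; other GPUs need only 8).
        data_collator_kwargs["pad_to_multiple_of"] = 64
    return data_collator_kwargs


cfg = SimpleNamespace(sequence_len=2048, pad_to_sequence_len=True)
print(build_collator_kwargs(cfg))
# {'padding': True, 'pad_to_multiple_of': 2048}

With `sequence_len: 2048`, every batch is padded to 2048 tokens, so buffer shapes stay constant across steps; per the README text added in this patch, that trades extra compute on short batches for less memory fragmentation and fewer OOMs.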