make pad_to_sequence_len default to the same value as sample_packing (#2941) [skip ci]

* make pad_to_sequence_len default to the same value as sample_packing

* remove duplicate validation

* fix test

* update description meta

Co-authored-by: NanoCode012 <nano@axolotl.ai>

---------

Co-authored-by: NanoCode012 <nano@axolotl.ai>
Wing Lian authored on 2025-07-21 11:40:56 -04:00; committed by GitHub
parent db5f6f4693, commit af8d257aa2
90 changed files with 109 additions and 90 deletions
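With this change, example configs that enable sample_packing no longer need to set pad_to_sequence_len explicitly: when the field is left unset, config validation resolves it from sample_packing. A minimal sketch of the resulting behavior, mirroring the new test at the bottom of this diff (min_base_cfg stands in for a minimal valid base config, as in the test's fixture):

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault

# pad_to_sequence_len is deliberately left unset here;
# sample_packing now drives its default during validation.
cfg = DictDefault(sample_packing=True) | min_base_cfg
cfg = validate_config(cfg)
assert cfg.pad_to_sequence_len is True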

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -25,7 +25,7 @@ lora_target_linear: true
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -16,7 +16,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:

@@ -19,7 +19,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -19,7 +19,7 @@ lora_model_dir:
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
adapter: lora

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -21,7 +21,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -26,5 +26,3 @@ timeout: 86400
# Preprocess specific configurations
memory_preprocess: 32
timeout_preprocess: 14400
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

@@ -27,7 +27,7 @@ lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -12,7 +12,7 @@ output_dir: ./outputs/out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -30,7 +30,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -25,7 +25,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -38,7 +38,7 @@ lora_target_modules:
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -31,7 +31,7 @@ lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -18,7 +18,7 @@ remove_unused_columns: false
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -35,7 +35,7 @@ lora_target_linear: true
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -25,7 +25,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -17,7 +17,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32

@@ -23,7 +23,7 @@ save_safetensors: true
adapter: qlora
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16

@@ -18,7 +18,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -14,7 +14,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:

@@ -14,7 +14,7 @@ output_dir: ./outputs/lisa-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:

@@ -14,7 +14,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 512
sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -18,7 +18,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 8
lora_alpha: 16

@@ -22,7 +22,7 @@ datasets:
output_dir: ./outputs/qat_out/
sample_packing: true
pad_to_sequence_len: true
sequence_len: 512
flex_attention: true

@@ -26,7 +26,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -11,7 +11,7 @@ output_dir: ./outputs/out
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -37,7 +37,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -49,7 +49,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -22,7 +22,7 @@ dataset_exact_deduplication: true
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -14,7 +14,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32

@@ -15,7 +15,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32

@@ -24,7 +24,7 @@ sample_packing: true
sample_packing_sequentially: true
curriculum_sampling: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -15,7 +15,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 32

@@ -18,7 +18,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -18,7 +18,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -18,7 +18,7 @@ adapter: qlora
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 16
lora_alpha: 16

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 512
sample_packing: false
pad_to_sequence_len: true
lora_r: 8
lora_alpha: 16

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -16,7 +16,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
wandb_project:

@@ -47,7 +47,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1

@@ -48,7 +48,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -51,7 +51,7 @@ output_dir: ./outputs/out
sequence_len: 4096 # up to 8k will work on a single H100
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -46,7 +46,7 @@ output_dir: ./outputs/out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 2

@@ -51,7 +51,7 @@ output_dir: ./outputs/out
sequence_len: 4096 # up to 8k will work on a single H100
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1

@@ -23,7 +23,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -22,7 +22,7 @@ lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -27,7 +27,7 @@ output_dir: ./outputs/out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1

@@ -14,7 +14,7 @@ output_dir: ./outputs/out
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
eval_sample_packing: false
wandb_project:

@@ -18,7 +18,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -31,7 +31,7 @@ output_dir: ./outputs/dpo-qlora
sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:

@@ -25,7 +25,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -34,7 +34,7 @@ lora_model_dir:
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -25,7 +25,7 @@ output_dir: ./outputs/out
sequence_len: 8000
sample_packing: true
pad_to_sequence_len: true
gradient_accumulation_steps: 1
micro_batch_size: 1

@@ -20,7 +20,7 @@ lora_model_dir:
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16

@@ -18,7 +18,7 @@ output_dir: ./outputs/out
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out
sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:

@@ -18,7 +18,7 @@ output_dir: ./outputs/phi-sft-out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:

@@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
adapter:
lora_model_dir:

@@ -15,7 +15,7 @@ output_dir: ./phi-sft-out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
trust_remote_code: true
adapter:

@@ -18,7 +18,7 @@ output_dir: ./out
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:

@@ -27,7 +27,7 @@ output_dir: ./outputs/dpo-out
sequence_len: 2048
sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -22,7 +22,7 @@ remove_unused_columns: false
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -17,7 +17,7 @@ output_dir: ./outputs/out
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:

@@ -18,7 +18,7 @@ remove_unused_columns: false
sequence_len: 2048
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true
wandb_project:
wandb_entity:

@@ -22,7 +22,7 @@ dataset_prepared_path: last_run_prepared
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
load_in_4bit: true
adapter: qlora

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/
sequence_len: 2048
sample_packing: true
flex_attention: true
pad_to_sequence_len: true
flex_attn_compile_kwargs:
dynamic: false

@@ -16,7 +16,7 @@ output_dir: ./outputs/out
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
adapter: qlora
lora_model_dir:

@@ -435,7 +435,7 @@ class AxolotlInputConfig(
     pad_to_sequence_len: bool | None = Field(
         default=None,
         json_schema_extra={
-            "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently"
+            "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to True if `sample_packing` enabled"
         },
     )
     curriculum_sampling: bool | None = Field(
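Note that the schema keeps default=None: the packing-aware default is applied later, during config validation, so the schema can still distinguish "unset" from an explicit user value. A hypothetical sketch of that resolution rule (illustrative only, not the commit's literal code):

def resolve_pad_to_sequence_len(cfg):
    # Hypothetical helper: an unset pad_to_sequence_len inherits the
    # effective sample_packing value; explicit user settings win.
    if cfg.pad_to_sequence_len is None:
        cfg.pad_to_sequence_len = bool(cfg.sample_packing)
    return cfg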

@@ -0,0 +1,21 @@
"""Tests for default values for configurations"""
from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault
class TestDefaultConfigValues:
"""Tests for default values for configurations"""
def test_pad_to_sequence_len(self, min_base_cfg):
"""Tests that sample packing automatically sets pad_to_sequence_len to True"""
cfg = (
DictDefault(
sample_packing=True,
)
| min_base_cfg
)
cfg = validate_config(cfg)
assert cfg.pad_to_sequence_len is True
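The new test pins only the packing-enabled path. A complementary case one might add (hypothetical; not part of this commit) to pin the non-packing default as well:

    def test_pad_to_sequence_len_no_packing(self, min_base_cfg):
        """Without sample packing, pad_to_sequence_len should stay falsy."""
        cfg = DictDefault(sample_packing=False) | min_base_cfg
        cfg = validate_config(cfg)
        # With the new default, pad_to_sequence_len mirrors sample_packing.
        assert not cfg.pad_to_sequence_len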