From af8d257aa22f9030b0f39d5bc7b150eed459eb9a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 21 Jul 2025 11:40:56 -0400 Subject: [PATCH] make pad_to_sequence_len default to the same value as sample_packing (#2941) [skip ci] * make pad_to_sequence_len default to the same value as sample_packing * remove duplicate validation * fix test * update description meta Co-authored-by: NanoCode012 --------- Co-authored-by: NanoCode012 --- examples/archived/code-llama/13b/lora.yml | 2 +- examples/archived/code-llama/13b/qlora.yml | 2 +- examples/archived/code-llama/34b/lora.yml | 2 +- examples/archived/code-llama/34b/qlora.yml | 2 +- examples/archived/code-llama/7b/lora.yml | 2 +- examples/archived/code-llama/7b/qlora.yml | 2 +- .../deepcoder/deepcoder-14B-preview-lora.yml | 2 +- examples/archived/gemma/qlora.yml | 2 +- examples/archived/stablelm-2/1.6b/fft.yml | 2 +- examples/archived/stablelm-2/1.6b/lora.yml | 2 +- examples/archived/starcoder2/qlora.yml | 2 +- examples/archived/tiny-llama/lora-mps.yml | 2 +- examples/archived/tiny-llama/lora.yml | 2 +- examples/archived/tiny-llama/qlora.yml | 2 +- examples/cloud/modal.yaml | 2 -- examples/cohere/command-r-7b-qlora.yml | 2 +- .../cogito-v1-preview-llama-3B-lora.yml | 2 +- .../cogito-v1-preview-qwen-14B-lora.yml | 2 +- examples/deepseek-v2/fft-fsdp-16b.yaml | 2 +- examples/deepseek-v2/qlora-fsdp-2_5.yaml | 2 +- examples/devstral/devstral-small-qlora.yml | 2 +- .../falcon-h1/falcon-h1-1b-deep-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-1b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-34b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-3b-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-500m-qlora.yaml | 2 +- examples/falcon-h1/falcon-h1-7b-qlora.yaml | 2 +- examples/gemma2/qlora.yml | 2 +- examples/gemma2/reward-model.yaml | 2 +- examples/gemma3/gemma-3-1b-qlora.yml | 2 +- examples/gemma3/gemma-3-4b-qlora.yml | 2 +- examples/glm4/qlora-32b.yaml | 2 +- examples/jamba/qlora_fsdp_large.yaml | 2 +- examples/lfm2/lfm2-350m-fft.yaml | 2 +- examples/llama-2/fft_optimized.yml | 2 +- examples/llama-2/lisa.yml | 2 +- examples/llama-2/loftq.yml | 2 +- examples/llama-2/lora.yml | 2 +- examples/llama-2/qlora-fsdp.yml | 2 +- examples/llama-2/qlora.yml | 2 +- examples/llama-2/relora.yml | 2 +- examples/llama-3/3b-qat-fsdp2.yaml | 2 +- examples/llama-3/fft-8b-liger-fsdp.yaml | 2 +- examples/llama-3/fft-8b.yaml | 2 +- examples/llama-3/instruct-dpo-lora-8b.yml | 2 +- examples/llama-3/instruct-lora-8b.yml | 2 +- examples/llama-3/lora-1b-deduplicate-dpo.yml | 2 +- examples/llama-3/lora-1b-deduplicate-sft.yml | 2 +- examples/llama-3/lora-1b-kernels.yml | 2 +- examples/llama-3/lora-1b-ray.yml | 2 +- .../lora-1b-sample-packing-sequentially.yml | 2 +- examples/llama-3/lora-1b.yml | 2 +- examples/llama-3/lora-8b.yml | 2 +- examples/llama-3/qlora-1b.yml | 2 +- examples/llama-3/qlora-fsdp-405b.yaml | 2 +- examples/llama-3/qlora-fsdp-70b.yaml | 2 +- examples/llama-3/qlora.yml | 2 +- examples/llama-3/sparse-finetuning.yaml | 2 +- .../do-no-use-fa2/maverick-qlora-fsdp1.yaml | 2 +- .../do-no-use-fa2/scout-qlora-fsdp1.yaml | 2 +- .../scout-qlora-single-h100.yaml | 2 +- .../llama-4/scout-qlora-flexattn-fsdp2.yaml | 2 +- .../llama-4/scout-qlora-single-h100-flex.yaml | 2 +- .../magistral/magistral-small-fsdp-qlora.yaml | 2 +- examples/magistral/magistral-small-qlora.yaml | 2 +- examples/mistral/bigstral-ds-zero3.yaml | 2 +- examples/mistral/config.yml | 2 +- examples/mistral/lora-mps.yml | 2 +- examples/mistral/lora.yml | 2 +- examples/mistral/mistral-dpo-qlora.yml | 2 +- examples/mistral/mistral-qlora-orpo.yml | 2 +- examples/mistral/mixtral.yml | 2 +- examples/mistral/mixtral_22.yml | 2 +- examples/mistral/qlora.yml | 2 +- examples/orpheus/finetune.yml | 2 +- examples/phi/lora-3.5.yaml | 2 +- examples/phi/phi-ft.yml | 2 +- examples/phi/phi-qlora.yml | 2 +- examples/phi/phi2-ft.yml | 2 +- examples/phi/phi3-ft-fsdp.yml | 2 +- examples/phi/phi3-ft.yml | 2 +- examples/qwen2/dpo.yaml | 2 +- examples/qwen2/prm.yaml | 2 +- examples/qwen2/qlora-fsdp.yaml | 2 +- examples/qwen2/reward-model.yaml | 2 +- examples/qwen3/32b-qlora.yaml | 2 +- examples/qwen3/8b-qat-fsdp2.yml | 2 +- examples/qwen3/qlora-fsdp.yaml | 2 +- src/axolotl/utils/schemas/config.py | 2 +- .../schemas/validation/test_default_values.py | 21 +++++++++++++++++++ 90 files changed, 109 insertions(+), 90 deletions(-) create mode 100644 tests/utils/schemas/validation/test_default_values.py diff --git a/examples/archived/code-llama/13b/lora.yml b/examples/archived/code-llama/13b/lora.yml index 0ed2382ba..98ef516ab 100644 --- a/examples/archived/code-llama/13b/lora.yml +++ b/examples/archived/code-llama/13b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/code-llama/13b/qlora.yml b/examples/archived/code-llama/13b/qlora.yml index 22bd1691b..2385368ac 100644 --- a/examples/archived/code-llama/13b/qlora.yml +++ b/examples/archived/code-llama/13b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/archived/code-llama/34b/lora.yml b/examples/archived/code-llama/34b/lora.yml index 25dc9f421..fb44997ff 100644 --- a/examples/archived/code-llama/34b/lora.yml +++ b/examples/archived/code-llama/34b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/code-llama/34b/qlora.yml b/examples/archived/code-llama/34b/qlora.yml index 0e33e2a45..22f4cae3c 100644 --- a/examples/archived/code-llama/34b/qlora.yml +++ b/examples/archived/code-llama/34b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/archived/code-llama/7b/lora.yml b/examples/archived/code-llama/7b/lora.yml index d288b9f65..0632bdfb7 100644 --- a/examples/archived/code-llama/7b/lora.yml +++ b/examples/archived/code-llama/7b/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/code-llama/7b/qlora.yml b/examples/archived/code-llama/7b/qlora.yml index de41c0123..0bd462aab 100644 --- a/examples/archived/code-llama/7b/qlora.yml +++ b/examples/archived/code-llama/7b/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml index 9e92c0a07..a9511e9e3 100644 --- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml +++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/gemma/qlora.yml b/examples/archived/gemma/qlora.yml index 2738112b4..80829b3c9 100644 --- a/examples/archived/gemma/qlora.yml +++ b/examples/archived/gemma/qlora.yml @@ -25,7 +25,7 @@ lora_target_linear: true sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/archived/stablelm-2/1.6b/fft.yml b/examples/archived/stablelm-2/1.6b/fft.yml index 9b45b399f..3ae08c9de 100644 --- a/examples/archived/stablelm-2/1.6b/fft.yml +++ b/examples/archived/stablelm-2/1.6b/fft.yml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: diff --git a/examples/archived/stablelm-2/1.6b/lora.yml b/examples/archived/stablelm-2/1.6b/lora.yml index 31e5ad933..e5aa81423 100644 --- a/examples/archived/stablelm-2/1.6b/lora.yml +++ b/examples/archived/stablelm-2/1.6b/lora.yml @@ -19,7 +19,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/starcoder2/qlora.yml b/examples/archived/starcoder2/qlora.yml index 18d85f9c3..889d837e8 100644 --- a/examples/archived/starcoder2/qlora.yml +++ b/examples/archived/starcoder2/qlora.yml @@ -19,7 +19,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/archived/tiny-llama/lora-mps.yml b/examples/archived/tiny-llama/lora-mps.yml index 66cf7cfb3..aa3b7d851 100644 --- a/examples/archived/tiny-llama/lora-mps.yml +++ b/examples/archived/tiny-llama/lora-mps.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false adapter: lora diff --git a/examples/archived/tiny-llama/lora.yml b/examples/archived/tiny-llama/lora.yml index 90998880f..a92f4bd67 100644 --- a/examples/archived/tiny-llama/lora.yml +++ b/examples/archived/tiny-llama/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/archived/tiny-llama/qlora.yml b/examples/archived/tiny-llama/qlora.yml index 8b2a4565a..4d422a5ee 100644 --- a/examples/archived/tiny-llama/qlora.yml +++ b/examples/archived/tiny-llama/qlora.yml @@ -21,7 +21,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/cloud/modal.yaml b/examples/cloud/modal.yaml index bbe8785f1..195031494 100644 --- a/examples/cloud/modal.yaml +++ b/examples/cloud/modal.yaml @@ -26,5 +26,3 @@ timeout: 86400 # Preprocess specific configurations memory_preprocess: 32 timeout_preprocess: 14400 - -# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/cohere/command-r-7b-qlora.yml b/examples/cohere/command-r-7b-qlora.yml index da2777270..b4741636b 100644 --- a/examples/cohere/command-r-7b-qlora.yml +++ b/examples/cohere/command-r-7b-qlora.yml @@ -27,7 +27,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml index 1a051b98b..6f0b505bd 100644 --- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml index 807342641..fefcfadea 100644 --- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml @@ -21,7 +21,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml index 78bf6b179..d23c789aa 100644 --- a/examples/deepseek-v2/fft-fsdp-16b.yaml +++ b/examples/deepseek-v2/fft-fsdp-16b.yaml @@ -12,7 +12,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml index da1d9aefd..0536d1c10 100644 --- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml +++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml @@ -30,7 +30,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/devstral/devstral-small-qlora.yml b/examples/devstral/devstral-small-qlora.yml index 9d92e8662..7fe4dd433 100644 --- a/examples/devstral/devstral-small-qlora.yml +++ b/examples/devstral/devstral-small-qlora.yml @@ -25,7 +25,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml index 484c31fec..2473179f0 100644 --- a/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/falcon-h1/falcon-h1-1b-qlora.yaml b/examples/falcon-h1/falcon-h1-1b-qlora.yaml index dea2a6e6d..bfb7836ef 100644 --- a/examples/falcon-h1/falcon-h1-1b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-1b-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/falcon-h1/falcon-h1-34b-qlora.yaml b/examples/falcon-h1/falcon-h1-34b-qlora.yaml index b187efbf6..80a9d45b5 100644 --- a/examples/falcon-h1/falcon-h1-34b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-34b-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/falcon-h1/falcon-h1-3b-qlora.yaml b/examples/falcon-h1/falcon-h1-3b-qlora.yaml index 4d981ad95..02be8ac5d 100644 --- a/examples/falcon-h1/falcon-h1-3b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-3b-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/falcon-h1/falcon-h1-500m-qlora.yaml b/examples/falcon-h1/falcon-h1-500m-qlora.yaml index 5ee13facd..b112d5d85 100644 --- a/examples/falcon-h1/falcon-h1-500m-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-500m-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/falcon-h1/falcon-h1-7b-qlora.yaml b/examples/falcon-h1/falcon-h1-7b-qlora.yaml index 4b665c3cd..c5505873d 100644 --- a/examples/falcon-h1/falcon-h1-7b-qlora.yaml +++ b/examples/falcon-h1/falcon-h1-7b-qlora.yaml @@ -38,7 +38,7 @@ lora_target_modules: sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml index 68d213fad..8a295a1f8 100644 --- a/examples/gemma2/qlora.yml +++ b/examples/gemma2/qlora.yml @@ -31,7 +31,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/gemma2/reward-model.yaml b/examples/gemma2/reward-model.yaml index 624ebdcd2..67b1228b2 100644 --- a/examples/gemma2/reward-model.yaml +++ b/examples/gemma2/reward-model.yaml @@ -18,7 +18,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml index 99921770d..115717db7 100644 --- a/examples/gemma3/gemma-3-1b-qlora.yml +++ b/examples/gemma3/gemma-3-1b-qlora.yml @@ -35,7 +35,7 @@ lora_target_linear: true sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml index 025cb9240..44ba9c879 100644 --- a/examples/gemma3/gemma-3-4b-qlora.yml +++ b/examples/gemma3/gemma-3-4b-qlora.yml @@ -25,7 +25,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/glm4/qlora-32b.yaml b/examples/glm4/qlora-32b.yaml index 8973cedd4..b3656e3ae 100644 --- a/examples/glm4/qlora-32b.yaml +++ b/examples/glm4/qlora-32b.yaml @@ -17,7 +17,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml index fda30e2d2..344f73e63 100644 --- a/examples/jamba/qlora_fsdp_large.yaml +++ b/examples/jamba/qlora_fsdp_large.yaml @@ -23,7 +23,7 @@ save_safetensors: true adapter: qlora sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 16 diff --git a/examples/lfm2/lfm2-350m-fft.yaml b/examples/lfm2/lfm2-350m-fft.yaml index 74c90c1e1..16a0a028e 100644 --- a/examples/lfm2/lfm2-350m-fft.yaml +++ b/examples/lfm2/lfm2-350m-fft.yaml @@ -18,7 +18,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml index c44cd2230..a23778b96 100644 --- a/examples/llama-2/fft_optimized.yml +++ b/examples/llama-2/fft_optimized.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml index a44e261be..25adcad5d 100644 --- a/examples/llama-2/lisa.yml +++ b/examples/llama-2/lisa.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/lisa-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml index 085627f63..606bbc735 100644 --- a/examples/llama-2/loftq.yml +++ b/examples/llama-2/loftq.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml index 759fce044..0781e0d1b 100644 --- a/examples/llama-2/lora.yml +++ b/examples/llama-2/lora.yml @@ -17,7 +17,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml index 3bf30120b..ceb3ce5d1 100644 --- a/examples/llama-2/qlora-fsdp.yml +++ b/examples/llama-2/qlora-fsdp.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 512 sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml index 09596c71e..1515872e6 100644 --- a/examples/llama-2/qlora.yml +++ b/examples/llama-2/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml index ca8b14a1c..6c9e83223 100644 --- a/examples/llama-2/relora.yml +++ b/examples/llama-2/relora.yml @@ -18,7 +18,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 8 lora_alpha: 16 diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml index 08d8ee5c1..d9b96fb96 100644 --- a/examples/llama-3/3b-qat-fsdp2.yaml +++ b/examples/llama-3/3b-qat-fsdp2.yaml @@ -22,7 +22,7 @@ datasets: output_dir: ./outputs/qat_out/ sample_packing: true -pad_to_sequence_len: true + sequence_len: 512 flex_attention: true diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml index e2808935f..b3d990a8b 100644 --- a/examples/llama-3/fft-8b-liger-fsdp.yaml +++ b/examples/llama-3/fft-8b-liger-fsdp.yaml @@ -26,7 +26,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml index 2dfe6d492..e067212b7 100644 --- a/examples/llama-3/fft-8b.yaml +++ b/examples/llama-3/fft-8b.yaml @@ -11,7 +11,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml index 10ab2a320..99de56ad3 100644 --- a/examples/llama-3/instruct-dpo-lora-8b.yml +++ b/examples/llama-3/instruct-dpo-lora-8b.yml @@ -37,7 +37,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml index 83b7f9a37..b8baa5b0a 100644 --- a/examples/llama-3/instruct-lora-8b.yml +++ b/examples/llama-3/instruct-lora-8b.yml @@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml index b20dbad84..288e8fd19 100644 --- a/examples/llama-3/lora-1b-deduplicate-dpo.yml +++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml @@ -49,7 +49,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml index 67e518184..6ce504a0d 100644 --- a/examples/llama-3/lora-1b-deduplicate-sft.yml +++ b/examples/llama-3/lora-1b-deduplicate-sft.yml @@ -22,7 +22,7 @@ dataset_exact_deduplication: true sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml index 92a948c2e..71e569ae0 100644 --- a/examples/llama-3/lora-1b-kernels.yml +++ b/examples/llama-3/lora-1b-kernels.yml @@ -14,7 +14,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml index 178a1fb89..7b9d15741 100644 --- a/examples/llama-3/lora-1b-ray.yml +++ b/examples/llama-3/lora-1b-ray.yml @@ -15,7 +15,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml index c4ce3eb0f..9f764e131 100644 --- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml +++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml @@ -24,7 +24,7 @@ sample_packing: true sample_packing_sequentially: true curriculum_sampling: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml index 82085483f..34d540eb7 100644 --- a/examples/llama-3/lora-1b.yml +++ b/examples/llama-3/lora-1b.yml @@ -15,7 +15,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 32 diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml index c39389755..ca6cd9e97 100644 --- a/examples/llama-3/lora-8b.yml +++ b/examples/llama-3/lora-8b.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml index 6b76ea8d9..288b7dc6c 100644 --- a/examples/llama-3/qlora-1b.yml +++ b/examples/llama-3/qlora-1b.yml @@ -18,7 +18,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml index 1ee922b59..0f31b5bdc 100644 --- a/examples/llama-3/qlora-fsdp-405b.yaml +++ b/examples/llama-3/qlora-fsdp-405b.yaml @@ -18,7 +18,7 @@ adapter: qlora sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 16 lora_alpha: 16 diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml index 5edd8353a..28387ba1b 100644 --- a/examples/llama-3/qlora-fsdp-70b.yaml +++ b/examples/llama-3/qlora-fsdp-70b.yaml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 512 sample_packing: false -pad_to_sequence_len: true + lora_r: 8 lora_alpha: 16 diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml index a674eca27..ffb00dace 100644 --- a/examples/llama-3/qlora.yml +++ b/examples/llama-3/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/llama-3/sparse-finetuning.yaml b/examples/llama-3/sparse-finetuning.yaml index 8577a19d2..ecf5df955 100644 --- a/examples/llama-3/sparse-finetuning.yaml +++ b/examples/llama-3/sparse-finetuning.yaml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false wandb_project: diff --git a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml index d4a038e11..3bd05b5ba 100644 --- a/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml +++ b/examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml @@ -47,7 +47,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml index bea10d979..1c6ba1410 100644 --- a/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml @@ -48,7 +48,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml index 737d93812..081089555 100644 --- a/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml @@ -51,7 +51,7 @@ output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml index b3e8c328c..6193e4ed5 100644 --- a/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml +++ b/examples/llama-4/scout-qlora-flexattn-fsdp2.yaml @@ -46,7 +46,7 @@ output_dir: ./outputs/out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 2 diff --git a/examples/llama-4/scout-qlora-single-h100-flex.yaml b/examples/llama-4/scout-qlora-single-h100-flex.yaml index 6be3988ef..c3bbfe56a 100644 --- a/examples/llama-4/scout-qlora-single-h100-flex.yaml +++ b/examples/llama-4/scout-qlora-single-h100-flex.yaml @@ -51,7 +51,7 @@ output_dir: ./outputs/out sequence_len: 4096 # up to 8k will work on a single H100 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/magistral/magistral-small-fsdp-qlora.yaml b/examples/magistral/magistral-small-fsdp-qlora.yaml index b23d2309a..4a769510a 100644 --- a/examples/magistral/magistral-small-fsdp-qlora.yaml +++ b/examples/magistral/magistral-small-fsdp-qlora.yaml @@ -23,7 +23,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true eval_sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/magistral/magistral-small-qlora.yaml b/examples/magistral/magistral-small-qlora.yaml index f0fce014f..bb2e0ccf0 100644 --- a/examples/magistral/magistral-small-qlora.yaml +++ b/examples/magistral/magistral-small-qlora.yaml @@ -22,7 +22,7 @@ lora_model_dir: sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yaml index e9bcbb7d6..a8dc36216 100644 --- a/examples/mistral/bigstral-ds-zero3.yaml +++ b/examples/mistral/bigstral-ds-zero3.yaml @@ -27,7 +27,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml index 8c4d80f79..455c3c224 100644 --- a/examples/mistral/config.yml +++ b/examples/mistral/config.yml @@ -14,7 +14,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + eval_sample_packing: false wandb_project: diff --git a/examples/mistral/lora-mps.yml b/examples/mistral/lora-mps.yml index d54c3e30b..c18d10aee 100644 --- a/examples/mistral/lora-mps.yml +++ b/examples/mistral/lora-mps.yml @@ -18,7 +18,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/mistral/lora.yml b/examples/mistral/lora.yml index 161255468..77a87a1da 100644 --- a/examples/mistral/lora.yml +++ b/examples/mistral/lora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/mistral/mistral-dpo-qlora.yml b/examples/mistral/mistral-dpo-qlora.yml index 8d0378690..49f5e4ede 100644 --- a/examples/mistral/mistral-dpo-qlora.yml +++ b/examples/mistral/mistral-dpo-qlora.yml @@ -31,7 +31,7 @@ output_dir: ./outputs/dpo-qlora sequence_len: 2048 sample_packing: false -pad_to_sequence_len: true + adapter: qlora lora_model_dir: diff --git a/examples/mistral/mistral-qlora-orpo.yml b/examples/mistral/mistral-qlora-orpo.yml index f37dc09fa..ea3e112b9 100644 --- a/examples/mistral/mistral-qlora-orpo.yml +++ b/examples/mistral/mistral-qlora-orpo.yml @@ -25,7 +25,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml index 5be9b4db8..933275484 100644 --- a/examples/mistral/mixtral.yml +++ b/examples/mistral/mixtral.yml @@ -34,7 +34,7 @@ lora_model_dir: sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/mistral/mixtral_22.yml b/examples/mistral/mixtral_22.yml index 100e4464f..0b606b7d7 100644 --- a/examples/mistral/mixtral_22.yml +++ b/examples/mistral/mixtral_22.yml @@ -25,7 +25,7 @@ output_dir: ./outputs/out sequence_len: 8000 sample_packing: true -pad_to_sequence_len: true + gradient_accumulation_steps: 1 micro_batch_size: 1 diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml index 08df36e15..a5e8b65fb 100644 --- a/examples/mistral/qlora.yml +++ b/examples/mistral/qlora.yml @@ -20,7 +20,7 @@ lora_model_dir: sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + lora_r: 32 lora_alpha: 16 diff --git a/examples/orpheus/finetune.yml b/examples/orpheus/finetune.yml index 57f65d966..9dcb8a43e 100644 --- a/examples/orpheus/finetune.yml +++ b/examples/orpheus/finetune.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/out sequence_len: 8192 sample_packing: true -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml index 9f3bbdf53..b7f902d63 100644 --- a/examples/phi/lora-3.5.yaml +++ b/examples/phi/lora-3.5.yaml @@ -28,7 +28,7 @@ output_dir: ./outputs/lora-out sequence_len: 4096 sample_packing: false -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml index fc6d649d7..4adb62d3a 100644 --- a/examples/phi/phi-ft.yml +++ b/examples/phi/phi-ft.yml @@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml index ccd92c817..11c08bfe6 100644 --- a/examples/phi/phi-qlora.yml +++ b/examples/phi/phi-qlora.yml @@ -18,7 +18,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml index 853250ccb..102c7ba03 100644 --- a/examples/phi/phi2-ft.yml +++ b/examples/phi/phi2-ft.yml @@ -15,7 +15,7 @@ output_dir: ./outputs/phi-sft-out sequence_len: 2048 sample_packing: true -pad_to_sequence_len: true + adapter: lora_model_dir: diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml index 130298bc0..e8290ea1f 100644 --- a/examples/phi/phi3-ft-fsdp.yml +++ b/examples/phi/phi3-ft-fsdp.yml @@ -15,7 +15,7 @@ output_dir: ./phi-sft-out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + trust_remote_code: true adapter: diff --git a/examples/phi/phi3-ft.yml b/examples/phi/phi3-ft.yml index 42b87e8d0..0b204963c 100644 --- a/examples/phi/phi3-ft.yml +++ b/examples/phi/phi3-ft.yml @@ -18,7 +18,7 @@ output_dir: ./out sequence_len: 4096 sample_packing: true -pad_to_sequence_len: true + adapter: lora lora_model_dir: diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml index 69a74ae4a..3b1f817e5 100644 --- a/examples/qwen2/dpo.yaml +++ b/examples/qwen2/dpo.yaml @@ -27,7 +27,7 @@ output_dir: ./outputs/dpo-out sequence_len: 2048 sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/qwen2/prm.yaml b/examples/qwen2/prm.yaml index af188f75d..a709a598d 100644 --- a/examples/qwen2/prm.yaml +++ b/examples/qwen2/prm.yaml @@ -22,7 +22,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml index 861ce5517..ca435b2bb 100644 --- a/examples/qwen2/qlora-fsdp.yaml +++ b/examples/qwen2/qlora-fsdp.yaml @@ -17,7 +17,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: diff --git a/examples/qwen2/reward-model.yaml b/examples/qwen2/reward-model.yaml index 1854b8216..08b8b4552 100644 --- a/examples/qwen2/reward-model.yaml +++ b/examples/qwen2/reward-model.yaml @@ -18,7 +18,7 @@ remove_unused_columns: false sequence_len: 2048 sample_packing: false eval_sample_packing: false -pad_to_sequence_len: true + wandb_project: wandb_entity: diff --git a/examples/qwen3/32b-qlora.yaml b/examples/qwen3/32b-qlora.yaml index 1f148ece5..87609c42f 100644 --- a/examples/qwen3/32b-qlora.yaml +++ b/examples/qwen3/32b-qlora.yaml @@ -22,7 +22,7 @@ dataset_prepared_path: last_run_prepared sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + load_in_4bit: true adapter: qlora diff --git a/examples/qwen3/8b-qat-fsdp2.yml b/examples/qwen3/8b-qat-fsdp2.yml index e4d0ed4fb..395812a56 100644 --- a/examples/qwen3/8b-qat-fsdp2.yml +++ b/examples/qwen3/8b-qat-fsdp2.yml @@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out/ sequence_len: 2048 sample_packing: true flex_attention: true -pad_to_sequence_len: true + flex_attn_compile_kwargs: dynamic: false diff --git a/examples/qwen3/qlora-fsdp.yaml b/examples/qwen3/qlora-fsdp.yaml index 762f9648d..6af3cfbc6 100644 --- a/examples/qwen3/qlora-fsdp.yaml +++ b/examples/qwen3/qlora-fsdp.yaml @@ -16,7 +16,7 @@ output_dir: ./outputs/out sequence_len: 2048 sample_packing: true eval_sample_packing: true -pad_to_sequence_len: true + adapter: qlora lora_model_dir: diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 96e3a8a3e..de928d11c 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -435,7 +435,7 @@ class AxolotlInputConfig( pad_to_sequence_len: bool | None = Field( default=None, json_schema_extra={ - "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently" + "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to True if `sample_packing` enabled" }, ) curriculum_sampling: bool | None = Field( diff --git a/tests/utils/schemas/validation/test_default_values.py b/tests/utils/schemas/validation/test_default_values.py new file mode 100644 index 000000000..332dfe77f --- /dev/null +++ b/tests/utils/schemas/validation/test_default_values.py @@ -0,0 +1,21 @@ +"""Tests for default values for configurations""" + +from axolotl.utils.config import validate_config +from axolotl.utils.dict import DictDefault + + +class TestDefaultConfigValues: + """Tests for default values for configurations""" + + def test_pad_to_sequence_len(self, min_base_cfg): + """Tests that sample packing automatically sets pad_to_sequence_len to True""" + cfg = ( + DictDefault( + sample_packing=True, + ) + | min_base_cfg + ) + + cfg = validate_config(cfg) + + assert cfg.pad_to_sequence_len is True