From 9f824ef76af48eff9f69f3a93ede03a6a2b4313a Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 4 Apr 2025 13:47:26 -0400
Subject: [PATCH] simplify the example configs to be more minimal and less
 daunting (#2486) [skip ci]

* simplify the example configs to be more minimal and less daunting

* drop empty s2_attention from example yamls
---
 examples/cerebras/btlm-ft.yml | 10 ----------
 examples/cerebras/qlora.yml | 10 ----------
 examples/code-llama/13b/lora.yml | 12 ------------
 examples/code-llama/13b/qlora.yml | 12 ------------
 examples/code-llama/34b/lora.yml | 12 ------------
 examples/code-llama/34b/qlora.yml | 12 ------------
 examples/code-llama/7b/lora.yml | 12 ------------
 examples/code-llama/7b/qlora.yml | 12 ------------
 examples/cohere/command-r-7b-qlora.yml | 12 ------------
 examples/dbrx/16bit-lora.yaml | 11 +----------
 examples/dbrx/8bit-lora.yaml | 8 +-------
 examples/dbrx/fft-ds-zero3.yaml | 11 +----------
 examples/deepseek-v2/fft-fsdp-16b.yaml | 11 -----------
 examples/deepseek-v2/qlora-fsdp-2_5.yaml | 8 --------
 examples/falcon/config-7b-lora.yml | 11 -----------
 examples/falcon/config-7b-qlora.yml | 10 ----------
 examples/falcon/config-7b.yml | 14 --------------
 examples/gemma/qlora.yml | 12 ------------
 examples/gemma2/qlora.yml | 12 ------------
 examples/gemma2/reward-model.yaml | 14 --------------
 examples/gemma3/gemma-3-1b-qlora.yml | 12 ------------
 examples/gemma3/gemma-3-4b-qlora.yml | 7 -------
 examples/gemma3/gemma-3-4b-vision-qlora.yml | 7 -------
 examples/gptj/qlora.yml | 11 -----------
 examples/jamba/qlora.yaml | 8 --------
 examples/jamba/qlora_deepspeed.yaml | 8 +-------
 examples/jamba/qlora_fsdp_large.yaml | 2 --
 examples/jeopardy-bot/config.yml | 8 --------
 examples/llama-2/fft_optimized.yml | 15 +--------------
 examples/llama-2/gptq-lora.yml | 10 ----------
 examples/llama-2/lisa.yml | 15 ---------------
 examples/llama-2/loftq.yml | 17 -----------------
 examples/llama-2/lora.yml | 14 --------------
 examples/llama-2/qlora-fsdp.yml | 11 -----------
 examples/llama-2/qlora.yml | 13 -------------
 examples/llama-2/relora.yml | 12 ------------
 examples/llama-3-vision/lora-11b.yaml | 7 -------
 examples/llama-3/fft-8b-liger-fsdp.yaml | 8 --------
 examples/llama-3/fft-8b.yaml | 13 -------------
 examples/llama-3/instruct-dpo-lora-8b.yml | 14 --------------
 examples/llama-3/instruct-lora-8b.yml | 14 --------------
 examples/llama-3/lora-1b-deduplicate-dpo.yml | 14 --------------
 examples/llama-3/lora-1b-deduplicate-sft.yml | 14 --------------
 examples/llama-3/lora-1b-kernels.yml | 14 --------------
 examples/llama-3/lora-1b-ray.yml | 14 +-------------
 .../lora-1b-sample-packing-sequentially.yml | 14 --------------
 examples/llama-3/lora-1b.yml | 14 --------------
 examples/llama-3/lora-8b.yml | 14 --------------
 examples/llama-3/qlora-1b-kto.yaml | 13 -------------
 examples/llama-3/qlora-1b.yml | 13 -------------
 examples/llama-3/qlora-fsdp-405b.yaml | 3 ---
 examples/llama-3/qlora-fsdp-70b.yaml | 11 -----------
 examples/llama-3/qlora.yml | 13 -------------
 examples/llava/lora-7b.yaml | 7 -------
 examples/mamba/config.yml | 13 -------------
 examples/mistral/bigstral-ds-zero3.yaml | 13 +------------
 examples/mistral/config.yml | 15 ---------------
 examples/mistral/lora-mps.yml | 15 ---------------
 examples/mistral/lora.yml | 13 -------------
 examples/mistral/mistral-dpo-qlora.yml | 14 --------------
 examples/mistral/mistral-qlora-fsdp.yml | 11 +----------
 examples/mistral/mistral-qlora-orpo.yml | 13 -------------
 examples/mistral/mistral-small-3.1-24B-lora.yml | 7 -------
 examples/mistral/mixtral-8x22b-qlora-fsdp.yml | 11 +----------
 examples/mistral/mixtral-qlora-fsdp.yml | 11 +----------
 examples/mistral/mixtral.yml | 13 +------------
 examples/mistral/mixtral_22.yml | 13 +------------
 examples/mistral/qlora.yml | 13 -------------
 examples/mpt-7b/config.yml | 9 ---------
 examples/openllama-3b/config.yml | 13 -------------
 examples/openllama-3b/lora.yml | 11 -----------
 examples/openllama-3b/qlora.yml | 11 -----------
 examples/phi/lora-3.5.yaml | 13 -------------
 examples/phi/phi-ft.yml | 14 --------------
 examples/phi/phi-qlora.yml | 11 -----------
 examples/phi/phi2-ft.yml | 14 --------------
 examples/phi/phi3-ft-fsdp.yml | 12 ------------
 examples/phi/phi3-ft.yml | 8 +-------
 examples/pixtral/lora-12b.yml | 7 -------
 examples/pythia-12b/config.yml | 10 ----------
 examples/pythia/lora.yml | 4 ----
 examples/qwen/lora.yml | 13 -------------
 examples/qwen/qlora.yml | 13 -------------
 examples/qwen/qwen2-moe-lora.yaml | 14 --------------
 examples/qwen/qwen2-moe-qlora.yaml | 11 -----------
 examples/qwen2-vl/lora-7b.yaml | 7 -------
 examples/qwen2/dpo.yaml | 12 ------------
 examples/qwen2/prm.yaml | 14 --------------
 examples/qwen2/qlora-fsdp.yaml | 9 ---------
 examples/qwen2/reward-model.yaml | 14 --------------
 examples/redpajama/config-3b.yml | 9 ---------
 examples/replit-3b/config-lora.yml | 10 ----------
 examples/stablelm-2/1.6b/fft.yml | 15 +--------------
 examples/stablelm-2/1.6b/lora.yml | 11 -----------
 examples/starcoder2/qlora.yml | 12 ------------
 examples/tiny-llama/lora-mps.yml | 10 ----------
 examples/tiny-llama/lora.yml | 11 -----------
 examples/tiny-llama/pretrain.yml | 14 --------------
 examples/tiny-llama/qlora.yml | 12 ------------
 examples/xgen-7b/xgen-7b-8k-qlora.yml | 8 --------
 examples/yi-34B-chat/qlora.yml | 13 -------------
 101 files changed, 14 insertions(+), 1140 deletions(-)

diff --git a/examples/cerebras/btlm-ft.yml b/examples/cerebras/btlm-ft.yml
index 44be53996..6190714b4 100644
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -8,9 +8,6 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
@@ -34,7 +31,6 @@ lora_alpha:
 lora_dropout:
 lora_target_modules:
 lora_target_linear:
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -58,16 +54,12 @@ learning_rate: 0.000085
 train_on_inputs: true
 group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1

-xformers_attention:
 flash_attention: true
 sdp_attention:
 flash_optimum:
@@ -80,8 +72,6 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:

-debug:
-deepspeed:
 weight_decay: 0.1
 special_tokens:
   pad_token: "<|endoftext|>"
diff --git a/examples/cerebras/qlora.yml b/examples/cerebras/qlora.yml
index 866b4ab58..e74b2d675 100644
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -22,7 +22,6 @@ lora_target_modules:
   - c_attn
   - c_proj
 lora_target_linear:
-lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -36,15 +35,10 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -53,10 +47,6 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
diff --git a/examples/code-llama/13b/lora.yml b/examples/code-llama/13b/lora.yml
index 2b8a720b2..6c205ae87 100644
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/code-llama/13b/qlora.yml b/examples/code-llama/13b/qlora.yml
index 92aa6ac97..28f0275d3 100644
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/code-llama/34b/lora.yml b/examples/code-llama/34b/lora.yml
index af343e389..6024ce3f7 100644
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/code-llama/34b/qlora.yml b/examples/code-llama/34b/qlora.yml
index f45e9205f..56c276cc9 100644
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/code-llama/7b/lora.yml b/examples/code-llama/7b/lora.yml
index 6c385dbcb..0eb20c244 100644
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/code-llama/7b/qlora.yml b/examples/code-llama/7b/qlora.yml
index ccd256406..f078f1398 100644
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/cohere/command-r-7b-qlora.yml b/examples/cohere/command-r-7b-qlora.yml
index 2ac5c4c09..8a2b6eacd 100644
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -44,28 +44,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/dbrx/16bit-lora.yaml b/examples/dbrx/16bit-lora.yaml
index 645ba1d59..1724c1426 100644
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -3,9 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2

 # hub_model_id: username/custom_model_name
 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -48,26 +45,20 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
+
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/dbrx/8bit-lora.yaml b/examples/dbrx/8bit-lora.yaml
index 4b9f60756..308483adf 100644
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -48,26 +48,20 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
+
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/dbrx/fft-ds-zero3.yaml b/examples/dbrx/fft-ds-zero3.yaml
index e42b63670..0fbb5b068 100644
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -3,9 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2

 # hub_model_id: username/custom_model_name
 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -35,25 +32,19 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
+
 weight_decay: 0.0
 deepspeed: deepspeed_configs/zero3_bf16.json
diff --git a/examples/deepseek-v2/fft-fsdp-16b.yaml b/examples/deepseek-v2/fft-fsdp-16b.yaml
index 649317494..3fe8691a3 100644
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -2,9 +2,6 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -31,27 +28,19 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 special_tokens:
 fsdp:
diff --git a/examples/deepseek-v2/qlora-fsdp-2_5.yaml b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
index 005a0c76f..a554970b6 100644
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -52,27 +52,19 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 special_tokens:
 fsdp:
diff --git a/examples/falcon/config-7b-lora.yml b/examples/falcon/config-7b-lora.yml
index efbe38d4a..2d9240e8b 100644
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -25,9 +25,7 @@ max_packed_sequence_len:
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.0
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -41,15 +39,10 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -58,11 +51,7 @@ gptq_model_v1:
 warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
   bos_token: "<|endoftext|>"
diff --git a/examples/falcon/config-7b-qlora.yml b/examples/falcon/config-7b-qlora.yml
index b9829db5f..78323db5f 100644
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -38,9 +38,7 @@ lora_alpha: 16
 # 0.05 for 33B and 65B models
 lora_dropout: 0.05
 # add LoRA modules on all linear layers of the base model
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -67,10 +65,7 @@ lr_scheduler: cosine
 # - 2e-4 for 7b & 13b
 # - 1e-4 for 33b & 64b
 learning_rate: 0.0002
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
 # stop training after this many evaluation losses have increased in a row
@@ -78,7 +73,6 @@ gradient_checkpointing: true
 early_stopping_patience: 3
 resume_from_checkpoint:
 auto_resume_from_checkpoints: true
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -87,11 +81,7 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.000001
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
   bos_token: "<|endoftext|>"
diff --git a/examples/falcon/config-7b.yml b/examples/falcon/config-7b.yml
index 5e41a1e33..a796b89dd 100644
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -7,9 +7,6 @@ tokenizer_type: AutoTokenizer

 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 gptq: false
 strict: false
 push_dataset_to_hub:
@@ -25,9 +22,7 @@ max_packed_sequence_len:
 lora_r: 64
 lora_alpha: 32
 lora_dropout: 0.0
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -41,15 +36,10 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -58,11 +48,7 @@ gptq_model_v1:
 warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
   bos_token: "<|endoftext|>"
diff --git a/examples/gemma/qlora.yml b/examples/gemma/qlora.yml
index 80a9fe62f..505564269 100644
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -42,28 +42,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/gemma2/qlora.yml b/examples/gemma2/qlora.yml
index 2f18cc76d..afba83552 100644
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -48,28 +48,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/gemma2/reward-model.yaml b/examples/gemma2/reward-model.yaml
index ada42ec28..d828af939 100644
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -5,9 +5,6 @@ num_labels: 1
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 reward_model: true
@@ -38,8 +35,6 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true
@@ -47,21 +42,12 @@ tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml
index 8852b3469..732b914a8 100644
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -50,30 +50,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml
index b4be17c3c..85e5dce68 100644
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -44,8 +44,6 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true
@@ -53,7 +51,6 @@ tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-local_rank:
 logging_steps: 1
 flash_attention: true
 eager_attention:
@@ -61,8 +58,4 @@ eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/gemma3/gemma-3-4b-vision-qlora.yml b/examples/gemma3/gemma-3-4b-vision-qlora.yml
index 6e711d6f6..92273380c 100644
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -46,8 +46,6 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true
@@ -55,7 +53,6 @@ tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-local_rank:
 logging_steps: 1
 flash_attention: true
 eager_attention:
@@ -63,8 +60,4 @@ eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/gptj/qlora.yml b/examples/gptj/qlora.yml
index ddd6d24c0..086d425b5 100644
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -18,9 +18,7 @@ max_packed_sequence_len:
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -34,15 +32,10 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0001
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -51,10 +44,6 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"
diff --git a/examples/jamba/qlora.yaml b/examples/jamba/qlora.yaml
index cab62513c..7d642cb0a 100644
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -40,26 +40,18 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 special_tokens:
diff --git a/examples/jamba/qlora_deepspeed.yaml b/examples/jamba/qlora_deepspeed.yaml
index 7ac7bfac5..d983dc391 100644
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -39,26 +39,20 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
+
 deepspeed: deepspeed_configs/zero2.json
 weight_decay: 0.0
 special_tokens:
diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
index 18c0ec086..a968d99d7 100644
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -39,8 +39,6 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: true
 tf32: true

diff --git a/examples/jeopardy-bot/config.yml b/examples/jeopardy-bot/config.yml
index 04f92a8dc..3609bd97e 100644
--- a/examples/jeopardy-bot/config.yml
+++ b/examples/jeopardy-bot/config.yml
@@ -33,13 +33,9 @@ optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.00003
-train_on_inputs: false
-group_by_length: false
 bf16: auto
 tf32: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 5
 xformers_attention: true
 flash_attention:
@@ -48,11 +44,7 @@ gptq_model_v1:
 warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/llama-2/fft_optimized.yml b/examples/llama-2/fft_optimized.yml
index 3475fcd9a..fd78fbfee 100644
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -4,9 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -26,7 +23,6 @@ lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_linear:
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,18 +37,12 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
@@ -61,11 +51,8 @@ flash_attn_fuse_mlp: true

 warmup_steps: 100
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
+
 deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/llama-2/gptq-lora.yml b/examples/llama-2/gptq-lora.yml
index 7d6b90ee3..ad2dbd9cf 100644
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -10,8 +10,6 @@ gptq_disable_exllama: true
 tokenizer_use_fast: true
 tokenizer_legacy: true

-load_in_8bit: false
-load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
@@ -33,7 +31,6 @@ lora_target_modules:
   - q_proj
   - v_proj
 lora_target_linear:
-lora_fan_in_fan_out:
 wandb_project:
 wandb_watch:
 wandb_name:
@@ -50,26 +47,19 @@ torchdistx_path:
 lr_scheduler: cosine
 lr_quadratic_warmup: true
 learning_rate: 0.000017
-train_on_inputs: false
-group_by_length: false
 bf16: false
 fp16: false
 float16: true
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention:
 sdp_attention:
 flash_optimum:
 warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
 special_tokens:
   bos_token: "<s>"
diff --git a/examples/llama-2/lisa.yml b/examples/llama-2/lisa.yml
index 40391204c..585fa9428 100644
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -4,9 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -26,7 +23,6 @@ lora_r:
 lora_alpha:
 lora_dropout:
 lora_target_linear:
-lora_fan_in_fan_out:

 lisa_n_layers: 4
 lisa_step_interval: 20
@@ -45,18 +41,12 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 5e-5 # recommendation from lisa paper for 7b

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
@@ -65,13 +55,8 @@ flash_attn_fuse_mlp: true

 warmup_steps: 100
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/llama-2/loftq.yml b/examples/llama-2/loftq.yml
index a5108e70f..bf32c7b27 100644
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -4,9 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -26,7 +23,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 peft:
   loftq_config:
     loftq_bits: 4
@@ -44,29 +40,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/llama-2/lora.yml b/examples/llama-2/lora.yml
index ec0c80012..3ef607ab4 100644
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml
index c2db26b81..759f08024 100644
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,19 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/llama-2/qlora.yml b/examples/llama-2/qlora.yml
index 81d1acbec..c678a0042 100644
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,27 +41,16 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/llama-2/relora.yml b/examples/llama-2/relora.yml
index 93247ce06..6c943d009 100644
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -24,9 +24,7 @@ pad_to_sequence_len: true
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 relora_steps: 150
 relora_warmup_steps: 10
@@ -45,28 +43,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"
diff --git a/examples/llama-3-vision/lora-11b.yaml b/examples/llama-3-vision/lora-11b.yaml
index 22dc3a9af..4431878fa 100644
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -45,14 +45,11 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true

 gradient_checkpointing: true
-local_rank:
 logging_steps: 1
 flash_attention: true
 eager_attention:
@@ -60,8 +57,4 @@ eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/llama-3/fft-8b-liger-fsdp.yaml b/examples/llama-3/fft-8b-liger-fsdp.yaml
index 652a00e5c..50169879c 100644
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -42,27 +42,19 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/llama-3/fft-8b.yaml b/examples/llama-3/fft-8b.yaml
index a129c6e5b..4452a6e3d 100644
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -1,9 +1,6 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -30,29 +27,19 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 100
 evals_per_epoch: 2
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
diff --git a/examples/llama-3/instruct-dpo-lora-8b.yml b/examples/llama-3/instruct-dpo-lora-8b.yml
index c7568dd78..a1b923fb6 100644
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -42,7 +42,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -57,28 +56,15 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml
index 8b1c254cb..362bda9aa 100644
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -37,7 +37,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -52,30 +51,17 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
diff --git a/examples/llama-3/lora-1b-deduplicate-dpo.yml b/examples/llama-3/lora-1b-deduplicate-dpo.yml
index 62d698559..e4b2a5244 100644
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -58,7 +58,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -73,28 +72,15 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/llama-3/lora-1b-deduplicate-sft.yml b/examples/llama-3/lora-1b-deduplicate-sft.yml
index bc748807b..b8c21fafb 100644
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -31,7 +31,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_modules_to_save:
   - embed_tokens
   - lm_head
@@ -49,30 +48,17 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
diff --git a/examples/llama-3/lora-1b-kernels.yml b/examples/llama-3/lora-1b-kernels.yml
index 9c47f266f..b76f03801 100644
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -1,9 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -24,7 +21,6 @@ lora_r: 16
 lora_alpha: 32
 # Currently, we don't support dropout with our custom Triton kernels
 # lora_dropout: 0.05
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -53,18 +49,12 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -73,10 +63,6 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llama-3/lora-1b-ray.yml b/examples/llama-3/lora-1b-ray.yml
index 0e597a204..199fe3b5d 100644
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -1,9 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -24,7 +21,6 @@ pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -47,18 +43,12 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -67,11 +57,9 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
+
 deepspeed: deepspeed_configs/zero3.json
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llama-3/lora-1b-sample-packing-sequentially.yml b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
index 79f5a2ba1..a027673ab 100644
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -33,7 +33,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_modules_to_save:
   - embed_tokens
   - lm_head
@@ -51,30 +50,17 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
diff --git a/examples/llama-3/lora-1b.yml b/examples/llama-3/lora-1b.yml
index a1c3afa87..8a536260a 100644
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -1,9 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -24,7 +21,6 @@ pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -47,18 +43,12 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -67,10 +57,6 @@ loss_watchdog_patience: 3
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llama-3/lora-8b.yml b/examples/llama-3/lora-8b.yml
index 7921857ce..700dd4614 100644
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -27,7 +27,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_modules_to_save:
   - embed_tokens
   - lm_head
@@ -45,30 +44,17 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: <|end_of_text|>
diff --git a/examples/llama-3/qlora-1b-kto.yaml b/examples/llama-3/qlora-1b-kto.yaml
index 32168dc37..0dc37b40a 100644
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -32,7 +32,6 @@ lora_r: 32
 lora_alpha: 64
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -47,31 +46,19 @@ optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 20
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llama-3/qlora-1b.yml b/examples/llama-3/qlora-1b.yml
index 226bbb237..c42dd2238 100644
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -24,7 +24,6 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -47,18 +46,12 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -66,13 +59,7 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml
index 434ee67c4..75c8f5973 100644
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -24,7 +24,6 @@ pad_to_sequence_len: true
 lora_r: 16
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true

 gradient_accumulation_steps: 4
@@ -34,8 +33,6 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: true
 tf32: true

diff --git a/examples/llama-3/qlora-fsdp-70b.yaml b/examples/llama-3/qlora-fsdp-70b.yaml
index ceb2d8567..c4889d643 100644
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,19 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.00001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/llama-3/qlora.yml b/examples/llama-3/qlora.yml
index 64268a205..607deb896 100644
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,17 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|end_of_text|>"
diff --git a/examples/llava/lora-7b.yaml b/examples/llava/lora-7b.yaml
index 9129d0122..68e463585 100644
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -41,14 +41,11 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true

 gradient_checkpointing: true
-local_rank:
 logging_steps: 1
 flash_attention: true
 eager_attention:
@@ -56,8 +53,4 @@ eager_attention:
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index ca96fbfc3..3a114bac7 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -5,9 +5,6 @@ tokenizer_type: AutoTokenizer
 tokenizer_config: EleutherAI/gpt-neox-20b
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -38,27 +35,17 @@ train_on_inputs: false
 group_by_length: true
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
 tokens:

 save_safetensors: False
diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yaml
index 5ee214c1b..bef989932 100644
--- a/examples/mistral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -6,9 +6,6 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 unfrozen_parameters:
@@ -40,27 +37,19 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 save_total_limit: 1
 save_steps:
-debug:
+
 deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   eos_token: "<|im_end|>"
 tokens:
diff --git a/examples/mistral/config.yml b/examples/mistral/config.yml
index 890203339..c58e2c954 100644
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -4,9 +4,6 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -34,28 +31,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.000005

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/mistral/lora-mps.yml b/examples/mistral/lora-mps.yml
index a62990e56..ba61cac11 100644
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -4,9 +4,6 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -28,7 +25,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -51,18 +47,13 @@ optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
 fp16: false
 tf32: true

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: false
 sdp_attention: true

@@ -71,12 +62,6 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/mistral/lora.yml b/examples/mistral/lora.yml
index 11c1e0ee7..b55c66715 100644
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -27,7 +27,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -50,18 +49,12 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -69,12 +62,6 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/mistral/mistral-dpo-qlora.yml b/examples/mistral/mistral-dpo-qlora.yml
index 168fd3c3b..6446cacb8 100644
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -40,7 +40,6 @@ lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.2
 lora_target_linear: true
-lora_fan_in_fan_out:

 lora_target_modules:
   - gate_proj
@@ -67,31 +66,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0001

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: false
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<|im_start|>"
   eos_token: "<|im_end|>"
diff --git a/examples/mistral/mistral-qlora-fsdp.yml b/examples/mistral/mistral-qlora-fsdp.yml
index 521f4de5f..5825ac749 100644
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -32,7 +32,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -47,18 +46,12 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -66,10 +59,8 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
+
 weight_decay: 0.0
 fsdp:
   - full_shard
diff --git a/examples/mistral/mistral-qlora-orpo.yml b/examples/mistral/mistral-qlora-orpo.yml
index 82f30dc17..9c6ae74ef 100644
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -32,7 +32,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:
 lora_target_modules:
   - gate_proj
   - down_proj
@@ -55,18 +54,12 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 loss_watchdog_threshold: 5.0
@@ -74,12 +67,6 @@ loss_watchdog_patience: 3

 warmup_steps: 10
 evals_per_epoch: 4
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
diff --git a/examples/mistral/mistral-small-3.1-24B-lora.yml b/examples/mistral/mistral-small-3.1-24B-lora.yml
index 177484799..0e6b4402d 100644
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -43,14 +43,11 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: true
 fp16:
 tf32: true

 gradient_checkpointing: true
-local_rank:
 logging_steps: 1
 flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
eager_attention: @@ -58,9 +55,5 @@ eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml index 353c08d85..e29b6392a 100644 --- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml +++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml @@ -30,7 +30,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -45,18 +44,12 @@ optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true loss_watchdog_threshold: 5.0 @@ -64,10 +57,8 @@ loss_watchdog_patience: 3 warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: + weight_decay: 0.0 fsdp: - full_shard diff --git a/examples/mistral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral-qlora-fsdp.yml index f9b5ab606..40bb5d5d1 100644 --- a/examples/mistral/mixtral-qlora-fsdp.yml +++ b/examples/mistral/mixtral-qlora-fsdp.yml @@ -32,7 +32,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -47,18 +46,12 @@ optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true loss_watchdog_threshold: 5.0 @@ -66,10 +59,8 @@ loss_watchdog_patience: 3 warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: + weight_decay: 0.0 fsdp: - full_shard diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral.yml index ac80ec933..eefd2456d 100644 --- a/examples/mistral/mixtral.yml +++ b/examples/mistral/mixtral.yml @@ -41,7 +41,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: #lora_target_modules: # - gate # - q_proj @@ -65,18 +64,12 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true loss_watchdog_threshold: 5.0 @@ -84,12 +77,8 @@ loss_watchdog_patience: 3 warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: + deepspeed: deepspeed_configs/zero2.json weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/mistral/mixtral_22.yml b/examples/mistral/mixtral_22.yml index 7f2a72212..1debd793a 100644 --- a/examples/mistral/mixtral_22.yml +++ b/examples/mistral/mixtral_22.yml @@ -6,9 +6,6 @@ tokenizer_type: LlamaTokenizer # hub_model_id: username/custom_model_name trust_remote_code: true - -load_in_8bit: false -load_in_4bit: false strict: false unfrozen_parameters: @@ -38,27 +35,19 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0001 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: 
resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true save_total_limit: 1 save_steps: -debug: + deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: eos_token: "<|im_end|>" tokens: diff --git a/examples/mistral/qlora.yml b/examples/mistral/qlora.yml index 5f3fa10b8..921f3fe7b 100644 --- a/examples/mistral/qlora.yml +++ b/examples/mistral/qlora.yml @@ -27,7 +27,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: lora_target_modules: - gate_proj - down_proj @@ -50,18 +49,12 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true loss_watchdog_threshold: 5.0 @@ -69,12 +62,6 @@ loss_watchdog_patience: 3 warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/mpt-7b/config.yml b/examples/mpt-7b/config.yml index cf4b433fe..e7485fad7 100644 --- a/examples/mpt-7b/config.yml +++ b/examples/mpt-7b/config.yml @@ -35,26 +35,17 @@ optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0000002 -train_on_inputs: false -group_by_length: false bf16: auto tf32: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 5 -xformers_attention: flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0001 -fsdp: -fsdp_config: tokens: pad_token: "<|padding|>" bos_token: "<|endoftext|>" diff --git a/examples/openllama-3b/config.yml b/examples/openllama-3b/config.yml index ec66014b4..7a1e2d9a5 100644 --- a/examples/openllama-3b/config.yml +++ b/examples/openllama-3b/config.yml @@ -4,9 +4,6 @@ model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false push_dataset_to_hub: datasets: @@ -23,7 +20,6 @@ lora_alpha: lora_dropout: lora_target_modules: lora_target_linear: -lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: @@ -37,29 +33,20 @@ optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.000003 -train_on_inputs: false -group_by_length: false float16: true bf16: false fp16: false tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: special_tokens: bos_token: "" eos_token: "" diff --git a/examples/openllama-3b/lora.yml b/examples/openllama-3b/lora.yml index b449df9ae..c1c597b73 100644 --- a/examples/openllama-3b/lora.yml +++ b/examples/openllama-3b/lora.yml @@ -29,7 +29,6 @@ lora_target_modules: - v_proj - k_proj - o_proj -lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: @@ -43,29 +42,19 @@ optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: false fp16: true tf32: false gradient_checkpointing: true 
-early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true gptq_groupsize: -s2_attention: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: special_tokens: bos_token: "" eos_token: "" diff --git a/examples/openllama-3b/qlora.yml b/examples/openllama-3b/qlora.yml index 3efcdabc6..e9c71efd1 100644 --- a/examples/openllama-3b/qlora.yml +++ b/examples/openllama-3b/qlora.yml @@ -21,9 +21,7 @@ sample_packing: true lora_r: 8 lora_alpha: 32 lora_dropout: 0.05 -lora_target_modules: lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: wandb_watch: @@ -37,28 +35,19 @@ optimizer: paged_adamw_32bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: false fp16: true tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: special_tokens: bos_token: "" eos_token: "" diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml index ddc2d7788..2ecb9d28d 100644 --- a/examples/phi/lora-3.5.yaml +++ b/examples/phi/lora-3.5.yaml @@ -37,7 +37,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -52,28 +51,16 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bfloat16: true bf16: true fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: -s2_attention: warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 4 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: diff --git a/examples/phi/phi-ft.yml b/examples/phi/phi-ft.yml index 29fad3094..886671a60 100644 --- a/examples/phi/phi-ft.yml +++ b/examples/phi/phi-ft.yml @@ -4,9 +4,6 @@ model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -27,7 +24,6 @@ lora_r: lora_alpha: lora_dropout: lora_target_linear: -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -45,30 +41,20 @@ max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 100 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" diff --git a/examples/phi/phi-qlora.yml b/examples/phi/phi-qlora.yml index d9f23ff26..a1cbf8a52 100644 --- a/examples/phi/phi-qlora.yml +++ b/examples/phi/phi-qlora.yml @@ -27,7 +27,6 @@ lora_r: 64 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -45,30 +44,20 @@ max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 -train_on_inputs: false -group_by_length: 
false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 100 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" diff --git a/examples/phi/phi2-ft.yml b/examples/phi/phi2-ft.yml index 1b7ac89ec..480017a39 100644 --- a/examples/phi/phi2-ft.yml +++ b/examples/phi/phi2-ft.yml @@ -4,9 +4,6 @@ model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -27,7 +24,6 @@ lora_r: lora_alpha: lora_dropout: lora_target_linear: -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -45,30 +41,20 @@ max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: True -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 100 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 -fsdp: -fsdp_config: resize_token_embeddings_to_32x: true special_tokens: pad_token: "<|endoftext|>" diff --git a/examples/phi/phi3-ft-fsdp.yml b/examples/phi/phi3-ft-fsdp.yml index 1479bb97f..766db76f6 100644 --- a/examples/phi/phi3-ft-fsdp.yml +++ b/examples/phi/phi3-ft-fsdp.yml @@ -4,9 +4,6 @@ model_type: AutoModelForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -28,7 +25,6 @@ lora_r: lora_alpha: lora_dropout: lora_target_linear: -lora_fan_in_fan_out: wandb_project: phi3 wandb_entity: @@ -46,27 +42,19 @@ max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 0.000003 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 100 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.1 fsdp: - full_shard diff --git a/examples/phi/phi3-ft.yml b/examples/phi/phi3-ft.yml index 58afd940e..0a5da2e23 100644 --- a/examples/phi/phi3-ft.yml +++ b/examples/phi/phi3-ft.yml @@ -7,9 +7,6 @@ tokenizer_type: AutoTokenizer # hub_model_id: username/custom_model_name chat_template: phi_3 - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -30,7 +27,6 @@ lora_r: 64 lora_alpha: 32 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: gradient_accumulation_steps: 1 micro_batch_size: 2 @@ -42,8 +38,6 @@ max_grad_norm: 1.0 lr_scheduler: cosine learning_rate: 5.0e-6 -train_on_inputs: false -group_by_length: false bf16: auto gradient_checkpointing: true @@ -55,9 +49,9 @@ flash_attention: true eval_steps: 1000 save_steps: 5000 -eval_table_size: 2 eval_batch_size: 2 eval_sample_packing: false +eval_table_size: 2 eval_max_new_tokens: 32 eval_causal_lm_metrics: ["perplexity"] do_causal_lm_eval: true diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml index 
7336a7ad0..d3b3efd70 100644 --- a/examples/pixtral/lora-12b.yml +++ b/examples/pixtral/lora-12b.yml @@ -41,14 +41,11 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: true fp16: tf32: true gradient_checkpointing: true -local_rank: logging_steps: 1 flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet eager_attention: @@ -56,10 +53,6 @@ eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: pad_token: diff --git a/examples/pythia-12b/config.yml b/examples/pythia-12b/config.yml index 52ab77055..170d6ac59 100644 --- a/examples/pythia-12b/config.yml +++ b/examples/pythia-12b/config.yml @@ -5,9 +5,6 @@ model_type: GPTNeoXForCausalLM tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false gptq: false device_map: auto datasets: @@ -22,7 +19,6 @@ max_packed_sequence_len: 2048 lora_r: 64 lora_alpha: 32 lora_dropout: 0.0 -lora_target_modules: lora_target_linear: true lora_fan_in_fan_out: true # pythia/GPTNeoX lora specific wandb_project: @@ -37,16 +33,10 @@ num_epochs: 5 learning_rate: 0.00003 optimizer: adamw_bnb_8bit lr_scheduler: cosine -train_on_inputs: false -group_by_length: false bf16: false fp16: false float16: true tf32: true flash_optimum: true -early_stopping_patience: resume_from_checkpoint: -local_rank: gradient_checkpointing: true -fsdp: -fsdp_config: diff --git a/examples/pythia/lora.yml b/examples/pythia/lora.yml index 203652f6b..0549967ec 100644 --- a/examples/pythia/lora.yml +++ b/examples/pythia/lora.yml @@ -28,13 +28,9 @@ gradient_accumulation_steps: 1 micro_batch_size: 4 num_epochs: 4 learning_rate: 0.00001 -train_on_inputs: false -group_by_length: false bf16: auto tf32: true -early_stopping_patience: resume_from_checkpoint: -local_rank: weight_decay: 0.1 evals_per_epoch: 4 logging_steps: 1 diff --git a/examples/qwen/lora.yml b/examples/qwen/lora.yml index 961125a51..23385d236 100644 --- a/examples/qwen/lora.yml +++ b/examples/qwen/lora.yml @@ -28,7 +28,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -43,28 +42,16 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/qwen/qlora.yml b/examples/qwen/qlora.yml index e7159eaa5..854a682fe 100644 --- a/examples/qwen/qlora.yml +++ b/examples/qwen/qlora.yml @@ -28,7 +28,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -43,28 +42,16 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: 
-deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/qwen/qwen2-moe-lora.yaml b/examples/qwen/qwen2-moe-lora.yaml index b357b9344..a2a1e4d25 100644 --- a/examples/qwen/qwen2-moe-lora.yaml +++ b/examples/qwen/qwen2-moe-lora.yaml @@ -3,9 +3,6 @@ base_model: Qwen/Qwen1.5-MoE-A2.7B # hub_model_id: username/custom_model_name trust_remote_code: true - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -25,7 +22,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -40,28 +36,18 @@ optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/qwen/qwen2-moe-qlora.yaml b/examples/qwen/qwen2-moe-qlora.yaml index d45e4c89f..b1ab131a6 100644 --- a/examples/qwen/qwen2-moe-qlora.yaml +++ b/examples/qwen/qwen2-moe-qlora.yaml @@ -25,7 +25,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -40,28 +39,18 @@ optimizer: paged_adamw_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/qwen2-vl/lora-7b.yaml b/examples/qwen2-vl/lora-7b.yaml index e7ab13ddb..3d0b10adf 100644 --- a/examples/qwen2-vl/lora-7b.yaml +++ b/examples/qwen2-vl/lora-7b.yaml @@ -41,14 +41,11 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: true fp16: tf32: true gradient_checkpointing: true -local_rank: logging_steps: 1 flash_attention: true eager_attention: @@ -56,8 +53,4 @@ eager_attention: warmup_ratio: 0.1 evals_per_epoch: 1 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: diff --git a/examples/qwen2/dpo.yaml b/examples/qwen2/dpo.yaml index d6dbe22d7..8df7f9dc4 100644 --- a/examples/qwen2/dpo.yaml +++ b/examples/qwen2/dpo.yaml @@ -44,27 +44,15 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: diff --git a/examples/qwen2/prm.yaml b/examples/qwen2/prm.yaml index 071e2d0f3..669f8e2db 100644 --- a/examples/qwen2/prm.yaml +++ b/examples/qwen2/prm.yaml @@ -5,9 +5,6 @@ num_labels: 2 tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false 
process_reward_model: true @@ -43,30 +40,19 @@ optimizer: adamw_torch lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: true fp16: tf32: gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_ratio: 0.1 evals_per_epoch: -eval_table_size: -eval_max_new_tokens: 128 eval_steps: 100 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/qwen2/qlora-fsdp.yaml b/examples/qwen2/qlora-fsdp.yaml index c537d3244..1f2ed83b1 100644 --- a/examples/qwen2/qlora-fsdp.yaml +++ b/examples/qwen2/qlora-fsdp.yaml @@ -26,7 +26,6 @@ lora_r: 32 lora_alpha: 64 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -41,27 +40,19 @@ optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 fsdp: - full_shard diff --git a/examples/qwen2/reward-model.yaml b/examples/qwen2/reward-model.yaml index bbd6e66ce..fcbb9867e 100644 --- a/examples/qwen2/reward-model.yaml +++ b/examples/qwen2/reward-model.yaml @@ -5,9 +5,6 @@ num_labels: 1 tokenizer_type: AutoTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false reward_model: true @@ -38,8 +35,6 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: true fp16: tf32: true @@ -47,21 +42,12 @@ tf32: true gradient_checkpointing: true gradient_checkpointing_kwargs: use_reentrant: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_ratio: 0.1 evals_per_epoch: -eval_table_size: -eval_max_new_tokens: 128 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/redpajama/config-3b.yml b/examples/redpajama/config-3b.yml index d716727a3..3e2999df9 100644 --- a/examples/redpajama/config-3b.yml +++ b/examples/redpajama/config-3b.yml @@ -36,26 +36,17 @@ optimizer: adamw_bnb_8bit torchdistx_path: lr_scheduler: cosine learning_rate: 0.0000002 -train_on_inputs: false -group_by_length: false bf16: auto tf32: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 5 -xformers_attention: flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0001 -fsdp: -fsdp_config: tokens: pad_token: "<|padding|>" bos_token: "<|endoftext|>" diff --git a/examples/replit-3b/config-lora.yml b/examples/replit-3b/config-lora.yml index bb2a6aace..5a02ba10c 100644 --- a/examples/replit-3b/config-lora.yml +++ b/examples/replit-3b/config-lora.yml @@ -20,7 +20,6 @@ lora_target_modules: - Wqkv - mlp_up - mlp_down -lora_fan_in_fan_out: wandb_project: lora-replit wandb_entity: wandb_watch: @@ -34,25 +33,16 @@ optimizer: torchdistx_path: lr_scheduler: learning_rate: 0.00001 -train_on_inputs: false -group_by_length: 
false bf16: auto tf32: true gradient_checkpointing: -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: gptq_groupsize: gptq_model_v1: warmup_steps: 20 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0 -fsdp: -fsdp_config: #special_tokens: diff --git a/examples/stablelm-2/1.6b/fft.yml b/examples/stablelm-2/1.6b/fft.yml index 3ecb1581b..f26714856 100644 --- a/examples/stablelm-2/1.6b/fft.yml +++ b/examples/stablelm-2/1.6b/fft.yml @@ -6,9 +6,6 @@ tokenizer_type: AutoTokenizer # hub_model_id: username/custom_model_name trust_remote_code: true - -load_in_8bit: false -load_in_4bit: false strict: false datasets: @@ -28,7 +25,6 @@ lora_r: lora_alpha: lora_dropout: lora_target_linear: -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -43,18 +39,12 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true @@ -63,11 +53,8 @@ flash_attn_fuse_mlp: true warmup_steps: 100 evals_per_epoch: 4 -eval_table_size: saves_per_epoch: 1 -debug: + deepspeed: #deepspeed_configs/zero2.json # multi-gpu only weight_decay: 0.1 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/stablelm-2/1.6b/lora.yml b/examples/stablelm-2/1.6b/lora.yml index 8597de6a2..aaa9908d1 100644 --- a/examples/stablelm-2/1.6b/lora.yml +++ b/examples/stablelm-2/1.6b/lora.yml @@ -28,7 +28,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -43,18 +42,12 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true flash_attn_cross_entropy: false flash_attn_rms_norm: true @@ -62,9 +55,5 @@ flash_attn_rms_norm: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/starcoder2/qlora.yml b/examples/starcoder2/qlora.yml index d1db71d6d..8b21d7145 100644 --- a/examples/starcoder2/qlora.yml +++ b/examples/starcoder2/qlora.yml @@ -25,9 +25,7 @@ pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -42,30 +40,20 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 2e-5 -train_on_inputs: false -group_by_length: false bf16: auto fp16: false tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 20 evals_per_epoch: 4 eval_steps: -eval_table_size: saves_per_epoch: 4 save_steps: save_total_limit: 2 -debug: -deepspeed: weight_decay: -fsdp: -fsdp_config: special_tokens: diff --git a/examples/tiny-llama/lora-mps.yml b/examples/tiny-llama/lora-mps.yml index c777a4d7b..8654a39bb 100644 --- a/examples/tiny-llama/lora-mps.yml +++ b/examples/tiny-llama/lora-mps.yml @@ -27,7 +27,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -42,26 
+41,17 @@ optimizer: adamw_torch_fused lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto fp16: false tf32: true gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: false warmup_steps: 10 evals_per_epoch: 0 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/tiny-llama/lora.yml b/examples/tiny-llama/lora.yml index 54aa5ec27..64b2360d3 100644 --- a/examples/tiny-llama/lora.yml +++ b/examples/tiny-llama/lora.yml @@ -26,7 +26,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -41,26 +40,16 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/tiny-llama/pretrain.yml b/examples/tiny-llama/pretrain.yml index fd6d2c9c1..2984c52ae 100644 --- a/examples/tiny-llama/pretrain.yml +++ b/examples/tiny-llama/pretrain.yml @@ -4,9 +4,6 @@ model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name - -load_in_8bit: false -load_in_4bit: false strict: false max_steps: 200 @@ -34,27 +31,16 @@ optimizer: adamw_bnb_8bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: -eval_table_size: saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/tiny-llama/qlora.yml b/examples/tiny-llama/qlora.yml index 694ab3a15..79e3164a5 100644 --- a/examples/tiny-llama/qlora.yml +++ b/examples/tiny-llama/qlora.yml @@ -27,9 +27,7 @@ pad_to_sequence_len: true lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 -lora_target_modules: lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -44,26 +42,16 @@ optimizer: paged_adamw_32bit lr_scheduler: cosine learning_rate: 0.0002 -train_on_inputs: false -group_by_length: false bf16: auto -fp16: tf32: false gradient_checkpointing: true -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: flash_attention: true warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 -fsdp: -fsdp_config: special_tokens: diff --git a/examples/xgen-7b/xgen-7b-8k-qlora.yml b/examples/xgen-7b/xgen-7b-8k-qlora.yml index d798e326d..f4ff589e0 100644 --- a/examples/xgen-7b/xgen-7b-8k-qlora.yml +++ b/examples/xgen-7b/xgen-7b-8k-qlora.yml @@ -36,9 +36,7 @@ lora_alpha: 16 # 0.05 for 33B and 65B models lora_dropout: 0.05 # add LoRA modules on all linear layers of the base model -lora_target_modules: lora_target_linear: true -lora_fan_in_fan_out: wandb_project: wandb_entity: @@ -65,10 +63,7 @@ lr_scheduler: cosine # - 2e-4 for 7b & 13b # - 1e-4 for 33b & 64b learning_rate: 0.00002 -train_on_inputs: false -group_by_length: false bf16: 
auto -fp16: tf32: false gradient_checkpointing: true # stop training after this many evaluation losses have increased in a row @@ -76,7 +71,6 @@ gradient_checkpointing: true early_stopping_patience: 3 resume_from_checkpoint: auto_resume_from_checkpoints: true -local_rank: logging_steps: 1 xformers_attention: true flash_attention: @@ -85,8 +79,6 @@ gptq_model_v1: warmup_steps: 10 evals_per_epoch: 4 saves_per_epoch: 1 -debug: -deepspeed: weight_decay: 0.0 special_tokens: eos_token: "<|endoftext|>" diff --git a/examples/yi-34B-chat/qlora.yml b/examples/yi-34B-chat/qlora.yml index b68d00883..de79ed6ce 100644 --- a/examples/yi-34B-chat/qlora.yml +++ b/examples/yi-34B-chat/qlora.yml @@ -10,7 +10,6 @@ load_in_4bit: true strict: false sequence_len: 1024 bf16: auto -fp16: tf32: false flash_attention: true special_tokens: @@ -30,8 +29,6 @@ num_epochs: 1 # Evaluation val_set_size: 0.1 evals_per_epoch: 5 -eval_table_size: -eval_max_new_tokens: 128 eval_sample_packing: false eval_batch_size: 1 @@ -43,7 +40,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 lora_target_linear: true -lora_fan_in_fan_out: lora_target_modules: # Sampling @@ -64,15 +60,6 @@ lr_scheduler: cosine learning_rate: 0.0002 # Misc -train_on_inputs: false -group_by_length: false -early_stopping_patience: resume_from_checkpoint: -local_rank: logging_steps: 1 -xformers_attention: -debug: -deepspeed: weight_decay: 0 -fsdp: -fsdp_config: