simplify the example configs to be more minimal and less daunting (#2486) [skip ci]

* simplify the example configs to be more minimal and less daunting

* drop empty s2_attention from example yamls
This commit is contained in:
Wing Lian
2025-04-04 13:47:26 -04:00
committed by GitHub
parent dd66fb163c
commit 9f824ef76a
101 changed files with 14 additions and 1140 deletions

View File

@@ -8,9 +8,6 @@ tokenizer_type: GPT2Tokenizer
trust_remote_code: true
tokenizer_use_fast: true
tokenizer_legacy: true
load_in_8bit: false
load_in_4bit: false
strict: false
push_dataset_to_hub:
hf_use_auth_token: true
@@ -34,7 +31,6 @@ lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
@@ -58,16 +54,12 @@ learning_rate: 0.000085
train_on_inputs: true
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
sdp_attention:
flash_optimum:
@@ -80,8 +72,6 @@ evals_per_epoch: 4
saves_per_epoch: 1
save_total_limit:
debug:
deepspeed:
weight_decay: 0.1
special_tokens:
pad_token: "<|endoftext|>"

View File

@@ -22,7 +22,6 @@ lora_target_modules:
- c_attn
- c_proj
lora_target_linear:
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
@@ -36,15 +35,10 @@ optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention: true
flash_attention:
@@ -53,10 +47,6 @@ gptq_model_v1:
warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.1
fsdp:
fsdp_config:
special_tokens:
pad_token: "<|endoftext|>"