new evals_per_epoch and saves_per_epoch to make things cleaner (#944)

* new evals_per_epoch and saves_per_epoch to make things cleaner

* update per PR feedback
This commit is contained in:
Wing Lian
2023-12-12 15:35:23 -05:00
committed by GitHub
parent f1de29dd1e
commit 5f79b8242f
37 changed files with 102 additions and 70 deletions

View File

@@ -691,9 +691,11 @@ warmup_ratio: 0.05 # cannot use with warmup_steps
learning_rate: 0.00003 learning_rate: 0.00003
lr_quadratic_warmup: lr_quadratic_warmup:
logging_steps: logging_steps:
eval_steps: # Leave empty to eval at each epoch; use an integer for every N steps, or a decimal for a fraction of total steps
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
save_strategy: # Set to `no` to skip checkpoint saves save_strategy: # Set to `no` to skip checkpoint saves
save_steps: # Leave empty to save at each epoch save_steps: # Leave empty to save at each epoch
eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
save_total_limit: # Maximum number of checkpoints kept at a time save_total_limit: # Maximum number of checkpoints kept at a time
# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that # Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
# if both are set, num_epochs will not be guaranteed. # if both are set, num_epochs will not be guaranteed.

View File

@@ -72,8 +72,8 @@ gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 32 warmup_steps: 32
eval_steps: evals_per_epoch: 4
save_steps: saves_per_epoch: 1
save_total_limit: save_total_limit:
debug: debug:

View File

@@ -49,8 +49,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -54,8 +54,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,8 +56,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,8 +54,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,8 +56,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,8 +54,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,8 +56,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -51,8 +51,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_steps: 40
eval_steps: 5 evals_per_epoch: 4
save_steps: 43 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -80,8 +80,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
eval_steps: 5 evals_per_epoch: 4
save_steps: 10 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.000001 weight_decay: 0.000001

View File

@@ -51,8 +51,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 40 warmup_steps: 40
eval_steps: 5 evals_per_epoch: 4
save_steps: 43 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -46,8 +46,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -42,8 +42,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 110 evals_per_epoch: 4
save_steps: 660 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -58,9 +58,9 @@ flash_attn_fuse_qkv: false
flash_attn_fuse_mlp: true flash_attn_fuse_mlp: true
warmup_steps: 100 warmup_steps: 100
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: #deepspeed/zero2.json # multi-gpu only deepspeed: #deepspeed/zero2.json # multi-gpu only
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -62,8 +62,8 @@ flash_attention:
sdp_attention: sdp_attention:
flash_optimum: flash_optimum:
warmup_steps: 100 warmup_steps: 100
eval_steps: evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -54,10 +54,10 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,9 +56,9 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -60,8 +60,8 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
save_steps: 50 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -54,9 +54,9 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -47,10 +47,10 @@ xformers_attention:
flash_attention: flash_attention:
warmup_steps: 10 warmup_steps: 10
eval_steps: evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: 0.25 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -46,10 +46,10 @@ xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -67,10 +67,10 @@ loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
eval_steps: evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed/zero2.json deepspeed: deepspeed/zero2.json
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -66,10 +66,10 @@ loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3 loss_watchdog_patience: 3
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -44,8 +44,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 110 evals_per_epoch: 4
save_steps: 660 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -49,8 +49,8 @@ flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -54,8 +54,8 @@ flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -48,8 +48,8 @@ flash_attention: true
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -59,8 +59,8 @@ xformers_attention:
flash_attention: flash_attention:
warmup_steps: 100 warmup_steps: 100
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -59,8 +59,8 @@ xformers_attention:
flash_attention: flash_attention:
warmup_steps: 100 warmup_steps: 100
eval_steps: 0.05 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.1 weight_decay: 0.1

View File

@@ -33,5 +33,5 @@ early_stopping_patience:
resume_from_checkpoint: resume_from_checkpoint:
local_rank: local_rank:
weight_decay: 0.1 weight_decay: 0.1
eval_steps: 0.05 evals_per_epoch: 4
logging_steps: 1 logging_steps: 1

View File

@@ -56,10 +56,10 @@ xformers_attention:
flash_attention: flash_attention:
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -56,10 +56,10 @@ xformers_attention:
flash_attention: flash_attention:
warmup_steps: 10 warmup_steps: 10
eval_steps: 0.05 evals_per_epoch: 4
eval_table_size: eval_table_size:
eval_table_max_new_tokens: 128 eval_table_max_new_tokens: 128
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -45,8 +45,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 110 evals_per_epoch: 4
save_steps: 660 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0001 weight_decay: 0.0001

View File

@@ -45,8 +45,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 20 warmup_steps: 20
eval_steps: 50 evals_per_epoch: 4
save_steps: saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0 weight_decay: 0

View File

@@ -78,8 +78,8 @@ flash_attention:
gptq_groupsize: gptq_groupsize:
gptq_model_v1: gptq_model_v1:
warmup_steps: 10 warmup_steps: 10
eval_steps: 50 evals_per_epoch: 4
save_steps: 50 saves_per_epoch: 1
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.0

View File

@@ -77,6 +77,15 @@ def normalize_config(cfg):
else: else:
cfg.torch_dtype = torch.float32 cfg.torch_dtype = torch.float32
if cfg.saves_per_epoch:
save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
if save_steps < 1.0: # prevent saves on every step
cfg.save_steps = save_steps
if cfg.evals_per_epoch:
eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
if eval_steps < 1.0: # prevent evals on every step
cfg.eval_steps = eval_steps
cfg.dataset_processes = cfg.dataset_processes or os.cpu_count() cfg.dataset_processes = cfg.dataset_processes or os.cpu_count()
if not cfg.base_model_config: if not cfg.base_model_config:
@@ -352,6 +361,27 @@ def validate_config(cfg):
cfg.datasets[idx].type = cfg.datasets[idx].type.replace( cfg.datasets[idx].type = cfg.datasets[idx].type.replace(
"sharegpt_simple", "sharegpt" "sharegpt_simple", "sharegpt"
) )
if cfg.saves_per_epoch and cfg.save_steps:
raise ValueError(
"save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
)
if cfg.saves_per_epoch and cfg.save_strategy and cfg.save_strategy != "steps":
raise ValueError(
"save_strategy must be empty or set to `steps` when used with saves_per_epoch."
)
if cfg.evals_per_epoch and cfg.eval_steps:
raise ValueError(
"eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
)
if (
cfg.evals_per_epoch
and cfg.evaluation_strategy
and cfg.evaluation_strategy != "steps"
):
raise ValueError(
"evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
)
if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps": if cfg.save_strategy and cfg.save_steps and cfg.save_strategy != "steps":
raise ValueError( raise ValueError(
"save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps." "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."