feature: loss watchdog for terminating training runs that are failing (#899)

Co-authored-by: Karl-Johan Alm <kalle@gmail.com>
This commit is contained in:
kallewoof
2023-12-04 21:54:34 +09:00
committed by GitHub
parent 476a205cea
commit 58ec8b1113
4 changed files with 40 additions and 0 deletions

View File

@@ -694,6 +694,9 @@ max_steps:
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
# Save model as safetensors (require safetensors package)
save_safetensors: