feature: loss watchdog for terminating training runs that are failing (#899)

Co-authored-by: Karl-Johan Alm <kalle@gmail.com>
This commit is contained in:
kallewoof
2023-12-04 21:54:34 +09:00
committed by GitHub
parent 476a205cea
commit 58ec8b1113
4 changed files with 40 additions and 0 deletions

View File

@@ -62,6 +62,9 @@ logging_steps: 1
# Bare `key:` with no value parses as null in YAML.
xformers_attention:
flash_attention: true
# Loss watchdog settings. Per the commit title, the watchdog terminates
# training runs that are failing.
# NOTE(review): presumably the loss level above which a run is treated as
# failing -- confirm against the watchdog implementation.
loss_watchdog_threshold: 5.0
# NOTE(review): presumably how many over-threshold checks are tolerated
# before the run is stopped -- confirm against the watchdog implementation.
loss_watchdog_patience: 3
warmup_steps: 10
eval_steps: 0.05
# Bare `key:` with no value parses as null in YAML.
eval_table_size: