Scheduler implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (#1273)

2024-02-13 14:23:28 +09:00
parent 4b997c3e1a
commit 8430db22e2
4 changed files with 152 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -813,6 +813,7 @@ early_stopping_patience: 3
 lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
+cosine_constant_lr_ratio: # hold lr constant at the minimum lr (see cosine_min_lr_ratio) after some fraction of the training steps, e.g. cosine_constant_lr_ratio=0.8 means lr stays at the minimum from 80% of the training steps onward (https://arxiv.org/pdf/2308.04014.pdf)

 # For one_cycle optim
 lr_div_factor: # Learning rate div factor