Scheduler implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (#1273)

This commit is contained in:
김진원
2024-02-13 14:23:28 +09:00
committed by GitHub
parent 4b997c3e1a
commit 8430db22e2
4 changed files with 152 additions and 2 deletions

View File

@@ -813,6 +813,7 @@ early_stopping_patience: 3
lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # fraction of the total training steps after which lr is frozen, e.g. cosine_constant_lr_ratio=0.8 means lr is held constant at the minimum lr (see cosine_min_lr_ratio) from 80% of the training steps onward (https://arxiv.org/pdf/2308.04014.pdf)
# For one_cycle optim
lr_div_factor: # Learning rate div factor