diff --git a/README.md b/README.md index d6c9cfefb..5fbac1a48 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,12 @@ log_sweep_max_lr: optimizer: # specify weight decay weight_decay: +# AdamW optimizer hyperparameters +adam_beta1: +adam_beta2: +adam_epsilon: +# Gradient clipping max norm +max_grad_norm: # whether to bettertransformers flash_optimum: