support galore once upstreamed into transformers (#1409)
* support galore once upstreamed into transformers * update module name for llama in readme and fix typing for all linear * bump trl for deprecation fixes from newer transformers * include galore as an extra and install in docker image * fix optim_args type * fix optim_args * update dependencies for galore * add galore to cicd dockerfile
This commit is contained in:
19
README.md
19
README.md
@@ -907,7 +907,26 @@ lr_div_factor: # Learning rate div factor
|
||||
# - paged_adamw_8bit
|
||||
# - paged_lion_32bit
|
||||
# - paged_lion_8bit
|
||||
# - galore_adamw
|
||||
# - galore_adamw_8bit
|
||||
# - galore_adafactor
|
||||
# - galore_adamw_layerwise
|
||||
# - galore_adamw_8bit_layerwise
|
||||
# - galore_adafactor_layerwise
|
||||
optimizer:
|
||||
# Dictionary of arguments to pass to the optimizer
|
||||
optim_args:
|
||||
# For Galore Optimizers the following optim_args are available
|
||||
# rank: # type: int
|
||||
# update_proj_gap # type: int
|
||||
# scale # type: float
|
||||
# proj_type: # type: str, default = std
|
||||
|
||||
# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
|
||||
optim_target_modules:
|
||||
# - self_attn # for llama
|
||||
# - mlp
|
||||
|
||||
# Specify weight decay
|
||||
weight_decay:
|
||||
# adamw hyperparams
|
||||
|
||||
Reference in New Issue
Block a user