support galore once upstreamed into transformers (#1409)

* support galore once upstreamed into transformers * update module name for llama in readme and fix typing for all linear * bump trl for deprecation fixes from newer transformers * include galore as an extra and install in docker image * fix optim_args type * fix optim_args * update dependencies for galore * add galore to cicd dockerfile
2024-03-19 09:26:35 -04:00
parent 40a88e8c4a
commit dd449c5cd8
7 changed files with 51 additions and 7 deletions
--- a/README.md
+++ b/README.md
@@ -907,7 +907,26 @@ lr_div_factor: # Learning rate div factor
 # - paged_adamw_8bit
 # - paged_lion_32bit
 # - paged_lion_8bit
+# - galore_adamw
+# - galore_adamw_8bit
+# - galore_adafactor
+# - galore_adamw_layerwise
+# - galore_adamw_8bit_layerwise
+# - galore_adafactor_layerwise
 optimizer:
+# Dictionary of arguments to pass to the optimizer
+optim_args:
+# For GaLore optimizers, the following optim_args are available:
+# rank:  # type: int
+# update_proj_gap:  # type: int
+# scale:  # type: float
+# proj_type:  # type: str, default = std
+
+# The target modules to optimize, i.e. the names of the modules that you would like to train. Currently this is used only by the GaLore optimizers.
+optim_target_modules:
+# - self_attn  # for llama
+# - mlp
+
 # Specify weight decay
 weight_decay:
 # adamw hyperparams