* add lisa support

* fix default and fix attribute traversal for layers

* improve lisa callback logging

* fix LISA by ensuring params are not frozen during __init__

* example config for lisa

---------

Co-authored-by: Aman Karmani <aman@tmm1.net>
Author: Wing Lian
Date: 2024-04-01 04:54:53 -07:00, committed via GitHub
Parent: 89134f2143
Commit: 0ddfb24fcf
4 changed files with 208 additions and 0 deletions
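The fourth changed file, the LISA example config mentioned in the commit message, is not reproduced in this view. Based on the options this diff introduces, the LISA-specific portion of such a config would look something like the following, shown here as the Python mapping the YAML would load into (all values are illustrative, not taken from the committed file):

# Illustrative values only; the committed example config itself is not shown here.
lisa_options = {
    "lisa_n_layers": 4,  # how many layers are unfrozen at once
    "lisa_step_interval": 20,  # re-sample the active layers every 20 steps
    "lisa_layers_attribute": "model.layers",  # dotted path from the model to its layer list
}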


@@ -45,6 +45,7 @@ from axolotl.utils.callbacks import (
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
@@ -200,6 +201,18 @@ class AxolotlTrainingArguments(TrainingArguments):
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of active layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )


class AxolotlTrainer(Trainer):
@@ -938,6 +951,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            )
            callbacks.append(early_stop_cb)

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))

        return callbacks

    def _get_trainer_cls(self):
@@ -1229,6 +1244,15 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                "relora_prune_ratio"
            ] = self.cfg.relora_prune_ratio

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
            training_arguments_kwargs[
                "lisa_step_interval"
            ] = self.cfg.lisa_step_interval
            training_arguments_kwargs[
                "lisa_layers_attribute"
            ] = self.cfg.lisa_layers_attribute

        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
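Taken together, the two hunks above wire the config through to the callback: LISA is opt-in (both lisa_step_interval and lisa_n_layers must be set), the lisa_* values land on AxolotlTrainingArguments, and the callback factory later reads them back off trainer.args. A minimal sketch of that flow with stand-in objects (cfg and args here are hypothetical stand-ins, not the real Axolotl classes):

from types import SimpleNamespace

# Stand-ins for the real cfg and trainer objects (hypothetical).
cfg = SimpleNamespace(
    lisa_n_layers=4, lisa_step_interval=20, lisa_layers_attribute="model.layers"
)

training_arguments_kwargs = {}
if cfg.lisa_step_interval and cfg.lisa_n_layers:
    training_arguments_kwargs["lisa_n_layers"] = cfg.lisa_n_layers
    training_arguments_kwargs["lisa_step_interval"] = cfg.lisa_step_interval
    training_arguments_kwargs["lisa_layers_attribute"] = cfg.lisa_layers_attribute

# The callback factory later reads the same values from trainer.args:
args = SimpleNamespace(**training_arguments_kwargs)
assert args.lisa_step_interval == 20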


@@ -0,0 +1,91 @@
"""
module for LISA
Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
Arxiv: https://arxiv.org/abs/2403.17919
License: Apache 2.0
"""
import logging
from functools import reduce
from typing import TYPE_CHECKING
import numpy as np
from transformers import TrainerCallback
if TYPE_CHECKING:
from axolotl.core.trainer_builder import AxolotlTrainer
LOG = logging.getLogger("axolotl.callbacks.lisa")
def lisa_callback_factory(trainer: "AxolotlTrainer"):
class LISACallback(TrainerCallback):
"""trainer callback for lisa layer switching"""
def __init__(
self, n_layers, step_interval, trainer, layers_attribute="model.layers"
):
super().__init__()
self.n_layers = n_layers
self.step_interval = step_interval
self.layers_attribute = layers_attribute
self.trainer = trainer
reduce(getattr, self.layers_attribute.split("."), self.trainer.model)
self.total_layers = len(
reduce(getattr, self.layers_attribute.split("."), self.trainer.model)
)
self.active_layers_indices = []
layers = reduce(
getattr, self.layers_attribute.split("."), self.trainer.model
)
LOG.info(
f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers*100/len(layers)}%) every {self.step_interval} steps"
)
def freeze_all_layers(self):
layers = reduce(
getattr, self.layers_attribute.split("."), self.trainer.model
)
for layer in layers:
for param in layer.parameters():
param.requires_grad = False
def on_step_begin(
self, args, state, control, **kwargs
): # pylint: disable=unused-argument
# Check if it's time to switch active layers, including at step 0
if state.global_step % self.step_interval == 0 or state.global_step == 1:
self.switch_active_layers()
def switch_active_layers(self):
# First, disable gradients for all layers
self.freeze_all_layers()
# Randomly select n_layers to activate
layers = reduce(
getattr, self.layers_attribute.split("."), self.trainer.model
)
self.active_layers_indices = np.random.choice(
range(self.total_layers), self.n_layers, replace=False
)
LOG.info(
f"Activating layers at indices: {self.active_layers_indices} for the next steps."
)
# Enable gradients only for the selected layers
for idx in self.active_layers_indices:
for param in layers[idx].parameters():
param.requires_grad = True
lisa_callback = LISACallback(
n_layers=trainer.args.lisa_n_layers,
step_interval=trainer.args.lisa_step_interval,
trainer=trainer,
layers_attribute=trainer.args.lisa_layers_attribute,
)
return lisa_callback
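The callback's core trick is the dotted-path traversal via reduce(getattr, ...) plus toggling requires_grad on the selected layers. Here is a self-contained sketch of that mechanism on a toy module (ToyModel and all values are made up for illustration; only the traversal/switching pattern mirrors the callback above):

from functools import reduce

import numpy as np
import torch.nn as nn


class ToyModel(nn.Module):
    def __init__(self, n_layers=8):
        super().__init__()
        self.model = nn.Module()
        self.model.layers = nn.ModuleList(nn.Linear(4, 4) for _ in range(n_layers))


model = ToyModel()
# "model.layers" resolves the same way lisa_layers_attribute does in the callback
layers = reduce(getattr, "model.layers".split("."), model)

# Freeze everything, then re-enable two randomly chosen layers
for layer in layers:
    for param in layer.parameters():
        param.requires_grad = False
for idx in np.random.choice(range(len(layers)), 2, replace=False):
    for param in layers[idx].parameters():
        param.requires_grad = True

# Exactly two entries should be True
print([any(p.requires_grad for p in layer.parameters()) for layer in layers])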


@@ -370,6 +370,23 @@ class MLFlowConfig(BaseModel):
    hf_mlflow_log_artifacts: Optional[bool] = None


class LISAConfig(BaseModel):
    """LISA options"""

    lisa_n_layers: Optional[int] = Field(
        default=None,
        metadata={"help": "the number of active layers in LISA"},
    )
    lisa_step_interval: Optional[int] = Field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = Field(
        default="model.layers",
        metadata={"help": "path under the model to access the layers"},
    )


class WandbConfig(BaseModel):
    """wandb configuration subset"""

@@ -404,6 +421,7 @@ class AxolotlInputConfig(
    HyperparametersConfig,
    WandbConfig,
    MLFlowConfig,
    LISAConfig,
    RemappedParameters,
    DeprecatedParameters,
    BaseModel,
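One detail worth noting in this last file: unlike the training-arguments mirror earlier, the pydantic model gives lisa_layers_attribute a concrete default of "model.layers", which appears to be the "fix default" from the commit message. A minimal sketch of the resulting behavior, using a simplified standalone stand-in for the LISAConfig above (illustration only, not the real Axolotl class):

from typing import Optional

from pydantic import BaseModel, Field


class LISAConfig(BaseModel):
    """Simplified stand-in for the LISAConfig defined in the diff."""

    lisa_n_layers: Optional[int] = Field(default=None)
    lisa_step_interval: Optional[int] = Field(default=None)
    lisa_layers_attribute: Optional[str] = Field(default="model.layers")


cfg = LISAConfig(lisa_n_layers=4, lisa_step_interval=20)
assert cfg.lisa_layers_attribute == "model.layers"  # default applies when unset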