activation offloading with cuda streams doesn't work with LoRA (#2927)

2025-07-16 11:59:20 -04:00
parent 2c408b5c5e
commit 36cbe13d18
2 changed files with 115 additions and 11 deletions
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -1066,23 +1066,23 @@ class ModelCompatibilityValidationMixin:
            raise ValueError("gradient_checkpointing is not supported for MPT models")
        return self

-    @model_validator(mode="after")
-    def check_offload_grad_checkpointing(self):
-        if self.gradient_checkpointing and self.gradient_checkpointing == "unsloth":
-            LOG.warning(
-                "`unsloth` is deprecated for gradient_checkpointing, use `offload`"
-            )
-            self.gradient_checkpointing = "offload"
-        return self
-
    @model_validator(mode="after")
    def check_gradient_checkpointing_w_offload(self):
        if self.gradient_checkpointing == "offload":
            LOG.warning(
-                "`offload` is deprecated for gradient_checkpointing, use `activation_offloading: true`"
+                "`offload` is deprecated for gradient_checkpointing, use `activation_offloading: true` or `activation_offloading: legacy`"
            )
            self.gradient_checkpointing = True
-            self.activation_offloading = True
+            if self.adapter and "lora" in self.adapter:
+                LOG.warning(
+                    "offloading with CUDA streams is not supported for LoRA adapters, using the `activation_offloading: legacy` implementation."
+                )
+                self.activation_offloading = "legacy"
+            else:
+                LOG.warning(
+                    "`offload` uses a new stream implementation; to use the previous implementation, use `activation_offloading: legacy`"
+                )
+                self.activation_offloading = True
        if self.gradient_checkpointing == "offload_disk":
            LOG.warning(
                "`offload_disk` is deprecated for gradient_checkpointing, use `activation_offloading: disk`"
@@ -1091,6 +1091,19 @@ class ModelCompatibilityValidationMixin:
            self.activation_offloading = "disk"
        return self

+    @model_validator(mode="after")
+    def check_activation_offloading_w_lora(self):
+        if (
+            self.activation_offloading is True
+            and self.adapter
+            and "lora" in self.adapter
+        ):
+            LOG.warning(
+                "activation_offloading with CUDA streams is not supported for LoRA adapters. Setting `activation_offloading: legacy`"
+            )
+            self.activation_offloading = "legacy"
+        return self
+
    @model_validator(mode="after")
    def check_activation_offloading_wo_gc(self):
        if self.activation_offloading and not self.gradient_checkpointing: