From 86be9f329e46281f9a5a93ea1585908f3ae95981 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 23 Mar 2026 02:26:10 -0400 Subject: [PATCH] post merge lora fixes for CI (#3536) [skip ci] * post merge lora fixes for CI * handle lora kernel auto-enable for moe without grouped_mm * prefer not to import torch in schema validation --- src/axolotl/utils/schemas/config.py | 33 +++++++++++++++++++++++++ tests/e2e/kernels/test_lora.py | 37 ++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 2f269b78e..34fd9ba2c 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -1385,6 +1385,39 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): if data.get("trust_remote_code"): return data + # Skip auto-enable for MoE models when native grouped_mm is unavailable + # (torch < 2.9). The grouped_mm fallback in transformers uses torch.mm + # with out= which bypasses autocast and fails on mixed dtypes during eval. + env_capabilities = data.get("env_capabilities", {}) + torch_version = env_capabilities.get("torch_version") + if torch_version is None: + import torch + + torch_version = str(torch.__version__).split("+", maxsplit=1)[0] + has_grouped_mm = version.parse(torch_version) >= version.parse("2.9.0") + if not has_grouped_mm: + is_moe = False + model_type = data.get("model_config_type", "") + if model_type and "moe" in model_type.lower(): + is_moe = True + if not is_moe: + try: + from transformers import AutoConfig + + base_model = data.get("base_model") + if base_model: + auto_cfg = AutoConfig.from_pretrained( + base_model, trust_remote_code=False + ) + if getattr(auto_cfg, "num_local_experts", None) or getattr( + auto_cfg, "num_experts", None + ): + is_moe = True + except Exception: # pylint: disable=broad-exception-caught + pass + if is_moe: + return data + # Check multi-GPU compatibility capabilities = data.get("capabilities") is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1 diff --git a/tests/e2e/kernels/test_lora.py b/tests/e2e/kernels/test_lora.py index 568524557..10850bdc8 100644 --- a/tests/e2e/kernels/test_lora.py +++ b/tests/e2e/kernels/test_lora.py @@ -176,24 +176,31 @@ def test_lora_mlp_direct(sample_tensors, activation_forward, activation_backward X.requires_grad = True output = LoRA_MLP.apply( X, + None, # X_drop gate_proj.weight, gate_proj.bias, None, # gate_quant None, # gate_A None, # gate_B None, # gate_scale + None, # gate_lora_bias + None, # gate_magnitude up_proj.weight, up_proj.bias, None, # up_quant None, # up_A None, # up_B None, # up_scale + None, # up_lora_bias + None, # up_magnitude down_proj.weight, down_proj.bias, None, # down_quant None, # down_A None, # down_B None, # down_scale + None, # down_lora_bias + None, # down_magnitude activation_forward, activation_backward, True, # inplace @@ -247,24 +254,31 @@ def test_lora_mlp_with_adapters( # Forward pass with adapters output = LoRA_MLP.apply( X, + None, # X_drop gate_proj.weight, gate_proj.bias, None, gate_A, gate_B, scale, + None, # gate_lora_bias + None, # gate_magnitude up_proj.weight, up_proj.bias, None, up_A, up_B, scale, + None, # up_lora_bias + None, # up_magnitude down_proj.weight, down_proj.bias, None, down_A, down_B, scale, + None, # down_lora_bias + None, # down_magnitude activation_forward, activation_backward, True, @@ -334,25 +348,32 @@ def test_lora_qkv(sample_tensors): Q1, K1, V1 = LoRA_QKV.apply( X, + None, # X_drop q_weight, None, 
None, None, None, None, + None, + None, # Q: weight, bias, quant, A, B, scale, lora_bias, magnitude k_weight, None, None, None, None, None, + None, + None, # K v_weight, None, None, None, None, None, - True, + None, + None, # V + True, # inplace ) assert Q1.shape == K1.shape == V1.shape == X.shape @@ -366,25 +387,32 @@ def test_lora_qkv(sample_tensors): # Test with LoRA adapters Q2, K2, V2 = LoRA_QKV.apply( X, + None, # X_drop q_weight, None, None, q_A, q_B, scale, + None, + None, # Q k_weight, None, None, k_A, k_B, scale, + None, + None, # K v_weight, None, None, v_A, v_B, scale, - True, + None, + None, # V + True, # inplace ) assert Q2.shape == K2.shape == V2.shape == X.shape @@ -427,7 +455,9 @@ def test_lora_o(sample_tensors): # Test forward pass X.requires_grad = True - output = LoRA_O.apply(X, W, b, None, A, B, scale) + output = LoRA_O.apply( + X, None, W, b, None, A, B, scale, None, None + ) # X_drop, ..., lora_bias, magnitude assert output.shape == (X.shape[0], X.shape[1], W.shape[0]) @@ -542,6 +572,7 @@ def test_inplace_operations(sample_tensors, apply_function): "down_proj": nn.Linear(shapes["out"], shapes["hidden"]).to( device="cuda", dtype=torch.float16 ), + "training": False, }, )
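
For readers following the config.py hunk above, the new MoE / grouped_mm gate can be read in isolation roughly as the sketch below. The standalone helper name, the plain-dict input, and the boolean return value are assumptions made for this sketch only; in the actual patch the logic lives inside the existing validator and simply returns `data` early when the auto-enable should be skipped.

    # Illustrative sketch of the gate added to config.py (assumed helper name
    # and flat-dict input; not part of the axolotl API).
    from packaging import version


    def should_skip_lora_kernel_auto_enable(data: dict) -> bool:
        """Return True when LoRA kernel auto-enable should be skipped for MoE
        models on torch < 2.9, mirroring the validator change above."""
        torch_version = (data.get("env_capabilities") or {}).get("torch_version")
        if torch_version is None:
            import torch  # fallback only when the capability probe is absent

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        # torch >= 2.9 ships native grouped_mm, so the fallback path in
        # transformers (torch.mm with out=, which bypasses autocast) is never
        # taken and there is nothing to skip.
        if version.parse(torch_version) >= version.parse("2.9.0"):
            return False

        # Cheap check first: the resolved model_config_type may already name
        # an MoE architecture.
        model_type = data.get("model_config_type") or ""
        if "moe" in model_type.lower():
            return True

        # Otherwise probe the HF config for expert-count fields
        # (num_local_experts / num_experts).
        try:
            from transformers import AutoConfig

            base_model = data.get("base_model")
            if base_model:
                auto_cfg = AutoConfig.from_pretrained(
                    base_model, trust_remote_code=False
                )
                return bool(
                    getattr(auto_cfg, "num_local_experts", None)
                    or getattr(auto_cfg, "num_experts", None)
                )
        except Exception:  # offline or unreadable config: don't skip
            pass
        return False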