From 86be9f329e46281f9a5a93ea1585908f3ae95981 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 23 Mar 2026 02:26:10 -0400 Subject: [PATCH] post merge lora fixes for CI (#3536) [skip ci] * post merge lora fixes for CI * handle lora kernel auto-enable for moe without grouped_mm * prefer not to import torch in schema validation --- src/axolotl/utils/schemas/config.py | 33 +++++++++++++++++++++++++ tests/e2e/kernels/test_lora.py | 37 ++++++++++++++++++++++++++--- 2 files changed, 67 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 2f269b78e..34fd9ba2c 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -1385,6 +1385,39 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): if data.get("trust_remote_code"): return data + # Skip auto-enable for MoE models when native grouped_mm is unavailable + # (torch < 2.9). The grouped_mm fallback in transformers uses torch.mm + # with out= which bypasses autocast and fails on mixed dtypes during eval. + env_capabilities = data.get("env_capabilities", {}) + torch_version = env_capabilities.get("torch_version") + if torch_version is None: + import torch + + torch_version = str(torch.__version__).split("+", maxsplit=1)[0] + has_grouped_mm = version.parse(torch_version) >= version.parse("2.9.0") + if not has_grouped_mm: + is_moe = False + model_type = data.get("model_config_type", "") + if model_type and "moe" in model_type.lower(): + is_moe = True + if not is_moe: + try: + from transformers import AutoConfig + + base_model = data.get("base_model") + if base_model: + auto_cfg = AutoConfig.from_pretrained( + base_model, trust_remote_code=False + ) + if getattr(auto_cfg, "num_local_experts", None) or getattr( + auto_cfg, "num_experts", None + ): + is_moe = True + except Exception: # pylint: disable=broad-exception-caught + pass + if is_moe: + return data + # Check multi-GPU compatibility capabilities = data.get("capabilities") is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1 diff --git a/tests/e2e/kernels/test_lora.py b/tests/e2e/kernels/test_lora.py index 568524557..10850bdc8 100644 --- a/tests/e2e/kernels/test_lora.py +++ b/tests/e2e/kernels/test_lora.py @@ -176,24 +176,31 @@ def test_lora_mlp_direct(sample_tensors, activation_forward, activation_backward X.requires_grad = True output = LoRA_MLP.apply( X, + None, # X_drop gate_proj.weight, gate_proj.bias, None, # gate_quant None, # gate_A None, # gate_B None, # gate_scale + None, # gate_lora_bias + None, # gate_magnitude up_proj.weight, up_proj.bias, None, # up_quant None, # up_A None, # up_B None, # up_scale + None, # up_lora_bias + None, # up_magnitude down_proj.weight, down_proj.bias, None, # down_quant None, # down_A None, # down_B None, # down_scale + None, # down_lora_bias + None, # down_magnitude activation_forward, activation_backward, True, # inplace @@ -247,24 +254,31 @@ def test_lora_mlp_with_adapters( # Forward pass with adapters output = LoRA_MLP.apply( X, + None, # X_drop gate_proj.weight, gate_proj.bias, None, gate_A, gate_B, scale, + None, # gate_lora_bias + None, # gate_magnitude up_proj.weight, up_proj.bias, None, up_A, up_B, scale, + None, # up_lora_bias + None, # up_magnitude down_proj.weight, down_proj.bias, None, down_A, down_B, scale, + None, # down_lora_bias + None, # down_magnitude activation_forward, activation_backward, True, @@ -334,25 +348,32 @@ def test_lora_qkv(sample_tensors): Q1, K1, V1 = LoRA_QKV.apply( X, + None, # X_drop q_weight, None, 
None, None, None, None, + None, + None, # Q: weight, bias, quant, A, B, scale, lora_bias, magnitude k_weight, None, None, None, None, None, + None, + None, # K v_weight, None, None, None, None, None, - True, + None, + None, # V + True, # inplace ) assert Q1.shape == K1.shape == V1.shape == X.shape @@ -366,25 +387,32 @@ def test_lora_qkv(sample_tensors): # Test with LoRA adapters Q2, K2, V2 = LoRA_QKV.apply( X, + None, # X_drop q_weight, None, None, q_A, q_B, scale, + None, + None, # Q k_weight, None, None, k_A, k_B, scale, + None, + None, # K v_weight, None, None, v_A, v_B, scale, - True, + None, + None, # V + True, # inplace ) assert Q2.shape == K2.shape == V2.shape == X.shape @@ -427,7 +455,9 @@ def test_lora_o(sample_tensors): # Test forward pass X.requires_grad = True - output = LoRA_O.apply(X, W, b, None, A, B, scale) + output = LoRA_O.apply( + X, None, W, b, None, A, B, scale, None, None + ) # X_drop, ..., lora_bias, magnitude assert output.shape == (X.shape[0], X.shape[1], W.shape[0]) @@ -542,6 +572,7 @@ def test_inplace_operations(sample_tensors, apply_function): "down_proj": nn.Linear(shapes["out"], shapes["hidden"]).to( device="cuda", dtype=torch.float16 ), + "training": False, }, )
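
For readers following the config.py hunk above, the new MoE / grouped_mm gate can be read in isolation roughly as the sketch below. The standalone helper name, the plain-dict input, and the boolean return value are assumptions made for this sketch only; in the actual patch the logic lives inside the existing validator and simply returns `data` early when the auto-enable should be skipped.

    # Illustrative sketch of the gate added to config.py (assumed helper name
    # and flat-dict input; not part of the axolotl API).
    from packaging import version


    def should_skip_lora_kernel_auto_enable(data: dict) -> bool:
        """Return True when LoRA kernel auto-enable should be skipped for MoE
        models on torch < 2.9, mirroring the validator change above."""
        torch_version = (data.get("env_capabilities") or {}).get("torch_version")
        if torch_version is None:
            import torch  # fallback only when the capability probe is absent

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        # torch >= 2.9 ships native grouped_mm, so the fallback path in
        # transformers (torch.mm with out=, which bypasses autocast) is never
        # taken and there is nothing to skip.
        if version.parse(torch_version) >= version.parse("2.9.0"):
            return False

        # Cheap check first: the resolved model_config_type may already name
        # an MoE architecture.
        model_type = data.get("model_config_type") or ""
        if "moe" in model_type.lower():
            return True

        # Otherwise probe the HF config for expert-count fields
        # (num_local_experts / num_experts).
        try:
            from transformers import AutoConfig

            base_model = data.get("base_model")
            if base_model:
                auto_cfg = AutoConfig.from_pretrained(
                    base_model, trust_remote_code=False
                )
                return bool(
                    getattr(auto_cfg, "num_local_experts", None)
                    or getattr(auto_cfg, "num_experts", None)
                )
        except Exception:  # offline or unreadable config: don't skip
            pass
        return False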