"""
|
|
End-to-end gradient and convergence tests for SonicMoE integration.
|
|
|
|
Requires:
|
|
- H100/H200 GPU (SonicMoE CUTLASS kernels target sm_90)
|
|
- sonicmoe package installed
|
|
- transformers with Qwen3MoE support
|
|
|
|
Usage:
|
|
pytest tests/e2e/integrations/test_sonicmoe.py -v -s
|
|
"""

import importlib.util
import math

import pytest
import torch

_sonicmoe_available = importlib.util.find_spec("sonicmoe") is not None
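# Compute capability (9, 0) == Hopper (sm_90), i.e. H100/H200-class GPUs.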
_is_hopper = torch.cuda.is_available() and torch.cuda.get_device_capability() == (9, 0)

pytestmark = [
    pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA GPU"),
    pytest.mark.skipif(
        not _is_hopper, reason="SonicMoE CUTLASS kernels require Hopper (sm_90)"
    ),
    pytest.mark.skipif(not _sonicmoe_available, reason="SonicMoE not installed"),
]


def _create_tiny_qwen3_config():
    """Create a minimal Qwen3MoE config for fast testing."""
    from transformers import AutoConfig

    config = AutoConfig.for_model("qwen3_moe")
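    # Sized deliberately small for fast e2e runs, while keeping hidden_size at or
    # above the minimum the SonicMoE kernel supports.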
    config.hidden_size = 512
    config.intermediate_size = 1024
    config.moe_intermediate_size = 64
    config.num_attention_heads = 16
    config.num_key_value_heads = 2
    config.head_dim = 32
    config.num_hidden_layers = 2
    config.num_experts = 8
    config.num_experts_per_tok = 2
    config.vocab_size = 1000
    config.max_position_embeddings = 128
    config.norm_topk_prob = True
    config.torch_dtype = torch.bfloat16
    return config


def _interleave_gate_up_weights(model):
    """Interleave all gate_up_proj parameters in-place for SonicMoE."""
    from axolotl.integrations.kernels.sonicmoe.weight_converter import (
        interleave_gate_up,
    )

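    # SonicMoE expects gate and up projection weights interleaved within each
    # gate_up_proj tensor, so convert the weights in place after patching.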
    with torch.no_grad():
        for name, param in model.named_parameters():
            if "gate_up_proj" in name:
                param.copy_(interleave_gate_up(param))


def _unpatch_sonicmoe():
    """Restore original forward on the MoE block class if it was patched."""
    from axolotl.integrations.kernels.constants import resolve_moe_block_classes

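    # The hasattr guard keeps this idempotent: it is safe to call even if the
    # patch was never applied or was already undone mid-test.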
    for moe_cls in resolve_moe_block_classes("qwen3_moe"):
        if hasattr(moe_cls, "_original_forward"):
            moe_cls.forward = moe_cls._original_forward
            del moe_cls._original_forward


class TestSonicMoEForwardCorrectness:
    """Verify SonicMoE-patched model produces same output as original."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_forward_output_matches(self):
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        # Original model
        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()

        with torch.no_grad():
            out_orig = model_orig(input_ids)

        # Patched model (same weights, interleaved for SonicMoE)
        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        model_patched.load_state_dict(model_orig.state_dict())

        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model_patched)

        with torch.no_grad():
            out_patched = model_patched(input_ids)

        max_diff = (out_orig.logits - out_patched.logits).abs().max().item()
        assert torch.allclose(
            out_orig.logits, out_patched.logits, atol=1e-1, rtol=1e-1
        ), f"Output mismatch: max diff={max_diff:.6f}"


class TestSonicMoEGradientCorrectness:
    """Compare gradients between original HuggingFace and SonicMoE-patched forward."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_gradients_match(self):
        """Verify all parameter gradients match between original and patched."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe
        from axolotl.integrations.kernels.sonicmoe.weight_converter import (
            deinterleave_gate_up,
        )

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        # ---------- Original model ----------
        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        out_orig = model_orig(input_ids, labels=input_ids)
        out_orig.loss.backward()
        grads_orig = {
            n: p.grad.float().clone()
            for n, p in model_orig.named_parameters()
            if p.grad is not None
        }
        loss_orig = out_orig.loss.item()

        # ---------- SonicMoE-patched model (same weights, interleaved) ----------
        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        model_patched.load_state_dict(model_orig.state_dict())

        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model_patched)

        out_patched = model_patched(input_ids, labels=input_ids)
        out_patched.loss.backward()
        grads_patched = {}
        for n, p in model_patched.named_parameters():
            if p.grad is None:
                continue
            g = p.grad.float().clone()
            # gate_up_proj grads are in interleaved layout, de-interleave to match orig
            if "gate_up_proj" in n:
                g = deinterleave_gate_up(g)
            grads_patched[n] = g
        loss_patched = out_patched.loss.item()

        # ---------- Compare ----------
        assert abs(loss_orig - loss_patched) < 0.5, (
            f"Loss mismatch: orig={loss_orig:.4f}, patched={loss_patched:.4f}"
        )

        # All parameters with gradients in original should have them in patched
        missing = set(grads_orig.keys()) - set(grads_patched.keys())
        assert not missing, f"Missing gradients in patched model: {missing}"

        # Compare gradient values
        # bf16 with different GEMM impls (cuBLAS vs CUTLASS) can diverge,
        # so use generous tolerance: flag only if both rel >10% AND abs >1e-2
        mismatches = []
        for name in grads_orig:
            if name not in grads_patched:
                continue
            g_orig = grads_orig[name]
            g_patched = grads_patched[name]
            max_diff = (g_orig - g_patched).abs().max().item()
            rel_diff = max_diff / (g_orig.abs().max().item() + 1e-8)

            if rel_diff > 0.1 and max_diff > 1e-2:
                mismatches.append(
                    f" {name}: max_abs_diff={max_diff:.6f}, rel_diff={rel_diff:.4f}"
                )

        assert not mismatches, (
            "Gradient mismatches (rel_diff > 10% and abs_diff > 1e-2):\n"
            + "\n".join(mismatches)
        )

    def test_router_weights_receive_gradients(self):
        """Verify that router (gate) weights get non-zero gradients."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        out = model(input_ids, labels=input_ids)
        out.loss.backward()

        gate_grads_found = False
        for name, param in model.named_parameters():
            if "gate" in name and "weight" in name:
                gate_grads_found = True
                assert param.grad is not None, f"No gradient for router: {name}"
                assert param.grad.abs().max() > 0, f"Zero gradient for router: {name}"

        assert gate_grads_found, "No gate.weight parameters found in model"


class TestSonicMoETrainingConvergence:
    """Verify loss decreases during training with SonicMoE."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_loss_decreases(self):
        """Run 30 training steps, verify loss decreases and no NaN/Inf."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        losses = []

        for step in range(30):
            out = model(input_ids, labels=input_ids)
            loss = out.loss
            assert not math.isnan(loss.item()), f"NaN loss at step {step}"
            assert not math.isinf(loss.item()), f"Inf loss at step {step}"
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        assert losses[-1] < losses[0], (
            f"Loss did not decrease: first={losses[0]:.4f}, last={losses[-1]:.4f}"
        )

    def test_expert_weights_update(self):
        """Verify expert weights change during training (not frozen)."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        # Snapshot expert weights before training
        expert_weights_before = {}
        for name, param in model.named_parameters():
            if "experts" in name:
                expert_weights_before[name] = param.data.clone()

        assert expert_weights_before, "No expert parameters found"

        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        for _ in range(5):
            out = model(input_ids, labels=input_ids)
            out.loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Check that expert weights changed
        changed = 0
        for name, param in model.named_parameters():
            if name in expert_weights_before:
                if not torch.equal(param.data, expert_weights_before[name]):
                    changed += 1

        assert changed > 0, "No expert weights changed after 5 training steps"