patch

2025-09-15 23:15:20 -04:00
parent 7d572b58d1
commit de4344a56e
2 changed files with 99 additions and 0 deletions
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -12,6 +12,7 @@ import transformers
 from transformers import PretrainedConfig, PreTrainedModel
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.moe_grouped import apply_grouped_to_moe_blocks
 from axolotl.monkeypatch.multipack import (
    SUPPORTED_MULTIPACK_MODEL_TYPES,
    patch_for_multipack,
@@ -57,6 +58,8 @@ class PatchManager:
        self._apply_fsdp_patches()
        self._apply_adapter_patches()
        self._apply_model_specific_patches()
        # Apply MoE grouped GEMM patches (cfg.moe_backend)
        apply_grouped_to_moe_blocks(self.cfg)
        self._apply_fp8_patches()
        self._apply_flash_attention_peft_patches()
        self._apply_gradient_checkpointing_patches()
--- a/src/axolotl/monkeypatch/moe_grouped.py
+++ b/src/axolotl/monkeypatch/moe_grouped.py
@@ -0,0 +1,96 @@
 import warnings
 import torch
 from axolotl.common.architectures import MOE_ARCH_BLOCK
 from axolotl.kernels.moe.backends import MOEBackend, get_moe_backend_name
 def _patch_block_forward(block_cls, grouped_fn):
    """Replace block_cls.forward with grouped_fn preserving signature."""
    setattr(block_cls, "forward", grouped_fn)
 def apply_grouped_to_moe_blocks(cfg=None) -> None:
    """
    Patch known MoE block classes to run through the torch_grouped backend.

    Patching happens only when ``cfg.moe_backend`` resolves (through
    ``get_moe_backend_name``) to ``MOEBackend.TORCH_GROUPED``, the
    ``torch_grouped`` kernel module imports, and its ``available()`` check
    passes. In every other case the original forwards are left in place.

    The installed forward keeps the original as a fallback and delegates to it
    when:
      * the call carries anything beyond ``hidden_states`` (the grouped kernel
        is only invoked for the plain single-tensor call),
      * the block instance lacks ``gate``, ``experts`` or ``top_k``,
      * the grouped kernel returns ``None`` as its output.

    Calling this function more than once is safe: a forward that is already a
    grouped wrapper is not wrapped again.

    Args:
        cfg: Object that may expose a ``moe_backend`` attribute, or ``None``.
    """
    # Function-scope imports keep this block self-contained.
    import functools
    import importlib

    preferred = getattr(cfg, "moe_backend", None)
    if get_moe_backend_name(preferred) != MOEBackend.TORCH_GROUPED:
        return
    try:
        from axolotl.kernels.moe import torch_grouped as _tg
    except Exception as exc:
        # The backend was explicitly requested, so say why nothing is patched
        # instead of returning silently.
        warnings.warn(
            f"torch_grouped requested but could not be imported ({exc}); "
            "skipping MoE patches"
        )
        return
    if not _tg.available():
        warnings.warn("torch_grouped requested but unavailable; skipping MoE patches")
        return

    # Architecture key -> transformers modeling module that should hold the
    # block class(es) named in MOE_ARCH_BLOCK for that key. Import is
    # best-effort: a missing module only skips that architecture.
    model_mods = {
        "mixtral": "transformers.models.mixtral.modeling_mixtral",
        "qwen2_moe": "transformers.models.qwen2_moe.modeling_qwen2_moe",
        "qwen3_moe": "transformers.models.qwen3_moe.modeling_qwen3_moe",
        "jamba": "transformers.models.jamba.modeling_jamba",
        "deepseek_v2": "transformers.models.deepseek_v2.modeling_deepseek_v2",
        "dbrx": "transformers.models.dbrx.modeling_dbrx",
        "jetmoe": "transformers.models.jetmoe.modeling_jetmoe",
        "gpt_oss": "transformers.models.gpt_oss.modeling_gpt_oss",
    }

    # Attributes the grouped kernel call reads from the block instance.
    required_attrs = ("gate", "experts", "top_k")
    # Flag set on every wrapper so repeated calls do not stack wrappers.
    patch_marker = "_axolotl_moe_grouped_patch"

    def make_grouped_forward(orig_forward):
        """Build a forward that tries the grouped kernel, else ``orig_forward``."""

        @functools.wraps(orig_forward)
        def _grouped_forward(self, hidden_states: torch.Tensor, *args, **kwargs):
            # Extra call arguments or a block without the expected attributes
            # cannot be served by the grouped kernel; defer to the original
            # forward rather than raising TypeError / AttributeError.
            if (
                args
                or kwargs
                or not all(hasattr(self, attr) for attr in required_attrs)
            ):
                return orig_forward(self, hidden_states, *args, **kwargs)
            y, router_logits = _tg.moe_ffn_forward_grouped(
                hidden_states, self.gate, self.experts, self.top_k
            )
            if y is None:
                # Kernel declined this call; use the original implementation.
                return orig_forward(self, hidden_states)
            return y, router_logits

        # Set after functools.wraps, which copies orig_forward.__dict__.
        setattr(_grouped_forward, patch_marker, True)
        return _grouped_forward

    for key, mod_path in model_mods.items():
        cls_names = MOE_ARCH_BLOCK.get(key)
        if not cls_names:
            continue
        names = cls_names if isinstance(cls_names, (list, tuple)) else [cls_names]
        try:
            modeling = importlib.import_module(mod_path)
            for name in names:
                block_cls = getattr(modeling, name, None)
                orig_forward = getattr(block_cls, "forward", None)
                if orig_forward is None:
                    # Class absent from this transformers version, or no forward.
                    continue
                if getattr(orig_forward, patch_marker, False):
                    # Already wrapped by an earlier call.
                    continue
                _patch_block_forward(block_cls, make_grouped_forward(orig_forward))
        except Exception as e:
            # Best effort; log and skip this entry
            warnings.warn(f"Skipping MoE patch for {key}: {e}")