feat: add moe quant to test by ved

2026-02-25 15:41:41 +07:00
parent 91dae42737
commit 2fc60b9021
2 changed files with 98 additions and 0 deletions
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -173,6 +173,14 @@ class ModelLoader:
        PLUGIN_MANAGER.pre_model_load(self.cfg)
        self.patch_manager.apply_post_plugin_pre_model_load_patches()
        skip_move_to_device = self._build_model()
        # Quantize 3D MoE expert nn.Parameter tensors that BnB skips during loading.
        self.model._moe_experts_quantized = False
        if self.cfg.adapter in ("qlora", "lora") and self.cfg.load_in_4bit:
            from axolotl.monkeypatch.moe_quant import quantize_moe_expert_params
            self.model._moe_experts_quantized = quantize_moe_expert_params(self.model)
        PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
        # Post-build model configuration
@@ -860,6 +868,10 @@ class ModelLoader:
            # Make sure everything is in the same dtype
            skip_prepare_model_for_kbit_training = True
        if getattr(self.model, "_moe_experts_quantized", False):
            # Parametrized expert tensors dequantize on access — would OOM.
            skip_prepare_model_for_kbit_training = True
        if (
            not skip_prepare_model_for_kbit_training
            and self.cfg.adapter in ["lora", "qlora"]
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -0,0 +1,86 @@
 """
 Post-load quantization for MoE expert weights stored as 3D nn.Parameter tensors.
 In transformers v5, many MoE models store expert weights as fused 3D nn.Parameter
 tensors instead of individual nn.Linear modules. BnB 4-bit quantization only targets
 nn.Linear, so these expert weights are skipped during model loading, causing OOM.
 This module provides a post-load fixup that quantizes those skipped parameters using
 bitsandbytes.nn.parametrize.replace_parameter_4bit (requires bitsandbytes >= 0.48.0).
 PEFT's target_parameters / ParamWrapper can then apply LoRA on top of these quantized
 params via stacked parametrizations.
 """
 import bitsandbytes as bnb
 import torch
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 def find_unquantized_expert_params(model):
    """Collect expert-style parameters that are still plain nn.Parameter tensors.

    Walks every submodule of ``model``. Modules that are already bitsandbytes
    ``Linear4bit`` / ``Linear8bitLt`` layers are skipped. For the remaining
    modules, each directly owned parameter (no recursion into children) is
    selected when it has three or more dimensions and its name contains one of
    ``experts``, ``gate_up_proj`` or ``down_proj``.

    Returns:
        List of ``(module, param_name)`` tuples, in module traversal order.
    """
    expert_keywords = ("experts", "gate_up_proj", "down_proj")
    already_quantized = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)

    def _is_expert_tensor(name, tensor):
        # The name is only inspected for tensors of rank 3 or higher.
        return tensor.ndim >= 3 and any(kw in name for kw in expert_keywords)

    return [
        (submodule, name)
        for _, submodule in model.named_modules()
        if not isinstance(submodule, already_quantized)
        for name, tensor in submodule.named_parameters(recurse=False)
        if _is_expert_tensor(name, tensor)
    ]
 def quantize_moe_expert_params(model, quant_type=None, compress_statistics=None):
    """Quantize 3D nn.Parameter expert weights that BnB skips during model loading.

    The candidate parameters are whatever ``find_unquantized_expert_params``
    returns for ``model``; each one is replaced in place through
    ``bitsandbytes.nn.parametrize.replace_parameter_4bit``.

    Args:
        model: Model whose expert parameters should be quantized.
        quant_type: 4-bit quantization type. When ``None`` it is read from
            ``bnb_4bit_quant_type`` on the model's ``quantization_config``,
            falling back to ``"nf4"``.
        compress_statistics: Passed through to ``replace_parameter_4bit``.
            When ``None`` it is read from ``bnb_4bit_use_double_quant`` on the
            model's ``quantization_config``, falling back to ``True``.

    Returns:
        ``True`` if at least one parameter was quantized, ``False`` if no
        candidate parameters were found (the model is then left untouched).

    Raises:
        ImportError: If candidates exist but ``bitsandbytes.nn.parametrize``
            cannot be imported.
    """
    params_to_quantize = find_unquantized_expert_params(model)
    if not params_to_quantize:
        return False

    # Imported only once there is actual work to do, so that models without
    # matching expert parameters do not depend on this bitsandbytes module.
    from bitsandbytes.nn.parametrize import replace_parameter_4bit

    # Derive settings from model's BnB config if not explicitly provided
    if quant_type is None or compress_statistics is None:
        bnb_config = getattr(
            getattr(model, "config", None), "quantization_config", None
        )
        if bnb_config is not None:

            def _setting(key, default):
                # Accept both a config object and a plain dict; attribute
                # lookup on a dict would silently yield the default.
                if isinstance(bnb_config, dict):
                    return bnb_config.get(key, default)
                return getattr(bnb_config, key, default)

            if quant_type is None:
                quant_type = _setting("bnb_4bit_quant_type", "nf4")
            if compress_statistics is None:
                compress_statistics = _setting("bnb_4bit_use_double_quant", True)

    # Final defaults (also cover a config that stores an explicit None)
    if quant_type is None:
        quant_type = "nf4"
    if compress_statistics is None:
        compress_statistics = True

    for module, param_name in params_to_quantize:
        replace_parameter_4bit(
            module,
            param_name,
            compress_statistics=compress_statistics,
            quant_type=quant_type,
        )

    # Ask the CUDA caching allocator to release blocks it no longer needs.
    torch.cuda.empty_cache()
    LOG.info(
        "Quantized %d MoE expert parameters to 4-bit (quant_type=%s, compress_statistics=%s)",
        len(params_to_quantize),
        quant_type,
        compress_statistics,
    )
    return True