fix: attempt disable async load

2026-02-25 17:30:32 +07:00
parent d3d6cb6b67
commit 1558436c69
1 changed files with 9 additions and 0 deletions
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -53,9 +53,18 @@ def patch_moe_quantization_on_load(cfg):
        LOG.debug("MoE loading-time quantization patch already active")
        return
    import os
    import transformers.core_model_loading
    from bitsandbytes.nn.parametrize import replace_parameter_4bit
    # Disable transformers' async weight loading thread pool. Without this,
    # the ThreadPoolExecutor pre-fetches tensors to CUDA faster than the main
    # loop can quantize them, causing all expert weights to accumulate in bf16
    # on GPU — defeating the purpose of loading-time quantization.
    os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
    LOG.info("Disabled async weight loading (HF_DEACTIVATE_ASYNC_LOAD=1)")
    # Read quantization settings from config
    quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
    compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)