fix: attempt to disable async weight loading

2026-02-25 17:30:32 +07:00
parent d3d6cb6b67
commit 1558436c69
1 changed file with 9 additions and 0 deletions
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -53,9 +53,18 @@ def patch_moe_quantization_on_load(cfg):
        LOG.debug("MoE loading-time quantization patch already active")
        return

+    import os
+
    import transformers.core_model_loading
    from bitsandbytes.nn.parametrize import replace_parameter_4bit

+    # Disable transformers' async weight loading thread pool. Without this,
+    # the ThreadPoolExecutor pre-fetches tensors to CUDA faster than the main
+    # loop can quantize them, causing all expert weights to accumulate in bf16
+    # on GPU — defeating the purpose of loading-time quantization.
+    os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
+    LOG.info("Disabled async weight loading (HF_DEACTIVATE_ASYNC_LOAD=1)")
+
    # Read quantization settings from config
    quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
    compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)