fix: attempt to disable async load
This commit is contained in:
@@ -53,9 +53,18 @@ def patch_moe_quantization_on_load(cfg):
|
|||||||
LOG.debug("MoE loading-time quantization patch already active")
|
LOG.debug("MoE loading-time quantization patch already active")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
import transformers.core_model_loading
|
import transformers.core_model_loading
|
||||||
from bitsandbytes.nn.parametrize import replace_parameter_4bit
|
from bitsandbytes.nn.parametrize import replace_parameter_4bit
|
||||||
|
|
||||||
|
# Disable transformers' async weight loading thread pool. Without this,
|
||||||
|
# the ThreadPoolExecutor pre-fetches tensors to CUDA faster than the main
|
||||||
|
# loop can quantize them, causing all expert weights to accumulate in bf16
|
||||||
|
# on GPU — defeating the purpose of loading-time quantization.
|
||||||
|
os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
|
||||||
|
LOG.info("Disabled async weight loading (HF_DEACTIVATE_ASYNC_LOAD=1)")
|
||||||
|
|
||||||
# Read quantization settings from config
|
# Read quantization settings from config
|
||||||
quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
|
quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
|
||||||
compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)
|
compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)
|
||||||
|
|||||||
Reference in New Issue
Block a user