fix: clear CUDA cache after quantizing each parameter

This commit is contained in:
NanoCode012
2026-02-25 16:38:42 +07:00
parent ad4e1a5a91
commit 593599a217

View File

@@ -117,8 +117,10 @@ def quantize_moe_expert_params(model, quant_type=None, compress_statistics=None)
quant_type=quant_type,
)
count += 1
# Free the bf16 → 4-bit conversion buffers after each parameter
# to avoid accumulating peak reserved VRAM.
torch.cuda.empty_cache()
torch.cuda.empty_cache()
LOG.info(
"Quantized %d MoE expert parameters to 4-bit (quant_type=%s, compress_statistics=%s)",
count,