fix: clear CUDA cache after quantizing each parameter

This commit is contained in:
NanoCode012
2026-02-25 16:38:42 +07:00
parent ad4e1a5a91
commit 593599a217

View File

@@ -117,8 +117,10 @@ def quantize_moe_expert_params(model, quant_type=None, compress_statistics=None)
quant_type=quant_type,
)
count += 1
# Free the bf16 → 4-bit conversion buffers after each parameter
# to avoid accumulating peak reserved VRAM.
torch.cuda.empty_cache()
torch.cuda.empty_cache()
LOG.info(
"Quantized %d MoE expert parameters to 4-bit (quant_type=%s, compress_statistics=%s)",
count,