From 593599a217583667cfe143f35c8064e0e0cabe1d Mon Sep 17 00:00:00 2001
From: NanoCode012 <nano@axolotl.ai>
Date: Wed, 25 Feb 2026 16:38:42 +0700
Subject: [PATCH] fix: clear cache per param quant

---
 src/axolotl/monkeypatch/moe_quant.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/monkeypatch/moe_quant.py b/src/axolotl/monkeypatch/moe_quant.py
index 702a5004e..be3b3e1e1 100644
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -117,8 +117,10 @@ def quantize_moe_expert_params(model, quant_type=None, compress_statistics=None)
             quant_type=quant_type,
         )
         count += 1
+        # Free the bf16 → 4-bit conversion buffers after each parameter so
+        # cached allocations do not pile up and raise peak reserved VRAM.
+        torch.cuda.empty_cache()
 
-    torch.cuda.empty_cache()
     LOG.info(
         "Quantized %d MoE expert parameters to 4-bit (quant_type=%s, compress_statistics=%s)",
         count,