From 593599a217583667cfe143f35c8064e0e0cabe1d Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 25 Feb 2026 16:38:42 +0700 Subject: [PATCH] fix: clear cache per param quant --- src/axolotl/monkeypatch/moe_quant.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/axolotl/monkeypatch/moe_quant.py b/src/axolotl/monkeypatch/moe_quant.py index 702a5004e..be3b3e1e1 100644 --- a/src/axolotl/monkeypatch/moe_quant.py +++ b/src/axolotl/monkeypatch/moe_quant.py @@ -117,8 +117,10 @@ def quantize_moe_expert_params(model, quant_type=None, compress_statistics=None) quant_type=quant_type, ) count += 1 + # Free the bf16 → 4-bit conversion buffers after each parameter + # to avoid accumulating peak reserved VRAM. + torch.cuda.empty_cache() - torch.cuda.empty_cache() LOG.info( "Quantized %d MoE expert parameters to 4-bit (quant_type=%s, compress_statistics=%s)", count,