From 731d5dd19304e8624f3d3e2ae6bcbd6ef1c7bcbc Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Wed, 25 Feb 2026 18:05:15 +0700
Subject: [PATCH] chore: remove leftover logs

---
 src/axolotl/monkeypatch/moe_quant.py | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/src/axolotl/monkeypatch/moe_quant.py b/src/axolotl/monkeypatch/moe_quant.py
index 70fb0726d..a22d5b136 100644
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -63,15 +63,10 @@ def patch_moe_quantization_on_load(cfg):
     # size (BnB doesn't know we'll quantize them), causing a ~50+ GiB reservation
     # that defeats loading-time quantization. Disabling it trades slightly slower
     # weight loading for dramatically lower peak VRAM.
-    _original_warmup = transformers.modeling_utils.caching_allocator_warmup
-
     def _noop_warmup(*args, **kwargs):
-        LOG.info(
-            "Skipped caching_allocator_warmup (MoE loading-time quantization active)"
-        )
+        pass
 
     transformers.modeling_utils.caching_allocator_warmup = _noop_warmup
-    LOG.info("Patched caching_allocator_warmup to no-op for MoE quantization")
 
     # Read quantization settings from config
     quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
@@ -101,24 +96,9 @@ def patch_moe_quantization_on_load(cfg):
         )
         torch.cuda.empty_cache()
         _moe_load_state["count"] += 1
-        if _moe_load_state["count"] % 10 == 1:
-            LOG.info(
-                "Quantized expert param #%d: %s "
-                "(alloc=%.2f GiB, reserved=%.2f GiB)",
-                _moe_load_state["count"],
-                target_name,
-                torch.cuda.memory_allocated() / 1024**3,
-                torch.cuda.memory_reserved() / 1024**3,
-            )
 
     transformers.core_model_loading.set_param_for_module = _patched_set_param_for_module
     _moe_load_state["patched"] = True
-    LOG.info(
-        "Activated MoE loading-time quantization patch "
-        "(quant_type=%s, compress_statistics=%s)",
-        quant_type,
-        compress_statistics,
-    )
 
 
 def get_moe_quantized_count():
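---

For context, the technique this diff cleans up is a plain attribute monkeypatch: replace transformers' caching_allocator_warmup with a function that does nothing, so the CUDA caching allocator never reserves space sized for the full unquantized weights before loading-time quantization shrinks them. The diff also deletes the unused _original_warmup capture, since the patched file never restores it. Below is a minimal standalone sketch of that pattern under stated assumptions: it assumes a transformers version whose modeling_utils module exposes caching_allocator_warmup (as the patched file does); the getattr guard and the disable/restore helpers are illustrative additions, not code from this repository.

import transformers.modeling_utils as modeling_utils


def _noop_warmup(*args, **kwargs):
    # Intentionally empty: skips the up-front allocator reservation that
    # would otherwise be sized for the full unquantized weights.
    pass


def disable_allocator_warmup():
    # Swap in the no-op and hand back the original implementation so a
    # caller can undo the patch after the model has loaded. The getattr
    # guard covers transformers versions that lack this helper.
    original = getattr(modeling_utils, "caching_allocator_warmup", None)
    if original is not None:
        modeling_utils.caching_allocator_warmup = _noop_warmup
    return original


def restore_allocator_warmup(original):
    # Put the saved implementation back once loading is finished.
    if original is not None:
        modeling_utils.caching_allocator_warmup = original

The save-and-restore variant is shown only for completeness; the patched file applies the no-op for the lifetime of the process, which is why the saved _original_warmup was dead code and could be removed here.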