diff --git a/scripts/benchmarks/deepseek_v3_moe_sweep.py b/scripts/benchmarks/deepseek_v3_moe_sweep.py index c0885152f..a52a4bb64 100644 --- a/scripts/benchmarks/deepseek_v3_moe_sweep.py +++ b/scripts/benchmarks/deepseek_v3_moe_sweep.py @@ -100,18 +100,6 @@ ARCHETYPES = ( }, [(4, 2048), (8, 4096)], ), - ( - "dbrx", - { - "hidden_size": 6144, - "moe_intermediate_size": 24576, - "n_experts": 16, - "top_k": 2, - "groups": 4, - "group_size": 192, - }, - [(4, 4096), (8, 8192)], - ), ( "qwen", { diff --git a/src/axolotl/monkeypatch/deepseek_v3/__init__.py b/src/axolotl/monkeypatch/deepseek_v3/__init__.py index 6d1e97d65..95c1b1653 100644 --- a/src/axolotl/monkeypatch/deepseek_v3/__init__.py +++ b/src/axolotl/monkeypatch/deepseek_v3/__init__.py @@ -375,6 +375,13 @@ def patch_deepseek_v3_moe( def patched_moe(self, hidden_states, topk_indices, topk_weights): backend_sel = getattr(self, "_axolotl_triton_backend", backend) group_size_sel = getattr(self, "_axolotl_group_size_m", group_size_m) + if backend_sel == "cg" and group_size_sel != _GROUP_SIZE_M: + LOG.debug( + "Adjusting group_size_m=%s to %s for CG backend", + group_size_sel, + _GROUP_SIZE_M, + ) + group_size_sel = _GROUP_SIZE_M try: return _moe_triton_forward( self,