diff --git a/README.md b/README.md
index 9c7a8a493..594b06156 100644
--- a/README.md
+++ b/README.md
@@ -30,7 +30,7 @@
 ## 🎉 Latest Updates
 
 - 2026/03:
-  - New model support has been added in Axolotl for Qwen3.5, Qwen3.5 MoE, [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
+  - New model support has been added in Axolotl for [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
   - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
 - 2026/02:
   - [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
diff --git a/docs/expert_quantization.qmd b/docs/expert_quantization.qmd
index 7271e8864..7eabed1cf 100644
--- a/docs/expert_quantization.qmd
+++ b/docs/expert_quantization.qmd
@@ -45,6 +45,7 @@ lora_target_parameters:
 
 ## Limitations
 
+- `lora_target_linear` is not compatible with `quantize_moe_experts`. See [Expert LoRA targeting](#expert-lora-targeting) instead.
 - `cpu_ram_efficient_loading` hangs / takes long time with FSDP2 + QLoRA.
 - Total model parameter count may display incorrectly (trainable param count is correct).
 - FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this.
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index 8d53cec52..5ea340c37 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -1302,6 +1302,11 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
     @classmethod
     def check_quantize_moe_experts(cls, data):
         if data.get("quantize_moe_experts"):
+            if data.get("lora_target_linear"):
+                raise ValueError(
+                    "lora_target_linear is not compatible with quantize_moe_experts. "
+                    "Use lora_target_parameters to target expert weights instead."
+                )
             if data.get("adapter") not in ("lora", "qlora"):
                 raise ValueError("quantize_moe_experts requires adapter: lora or qlora")
             if not (data.get("load_in_4bit") or data.get("load_in_8bit")):
diff --git a/tests/utils/schemas/validation/test_moe_quant.py b/tests/utils/schemas/validation/test_moe_quant.py
index b969cbb68..a2121473a 100644
--- a/tests/utils/schemas/validation/test_moe_quant.py
+++ b/tests/utils/schemas/validation/test_moe_quant.py
@@ -79,6 +79,20 @@ class TestQuantizeMoeExpertsValidation:
         result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
         assert result["quantize_moe_experts"] is False
 
+    def test_rejects_lora_target_linear(self, min_base_cfg, gpu_caps, env_caps):
+        """quantize_moe_experts with lora_target_linear should fail."""
+        cfg = (
+            DictDefault(
+                quantize_moe_experts=True,
+                adapter="qlora",
+                load_in_4bit=True,
+                lora_target_linear=True,
+            )
+            | min_base_cfg
+        )
+        with pytest.raises(ValueError, match="lora_target_linear is not compatible"):
+            validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
+
     def test_default_is_false(self, min_base_cfg, gpu_caps, env_caps):
         """quantize_moe_experts should default to false."""
         cfg = DictDefault({}) | min_base_cfg
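Usage note: after this change, a `quantize_moe_experts` config must target expert weights explicitly via `lora_target_parameters` rather than with `lora_target_linear`. Below is a minimal sketch of a config that passes the new validator, assuming illustrative values: the parameter names under `lora_target_parameters` are hypothetical and depend on the model architecture (see docs/expert_quantization.qmd for the real targeting guidance).

```yaml
# Sketch of a config the new check accepts; parameter names are placeholders.
adapter: qlora
load_in_4bit: true
quantize_moe_experts: true

# lora_target_linear: true  # now rejected: raises ValueError at config validation

lora_target_parameters:
  - experts.gate_up_proj  # hypothetical expert weight names; substitute
  - experts.down_proj     # the expert parameter names of your model
```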