fix(validation): add validation for lora target linear with quantize experts (#3461)
* fix: add validation for lora target linear with quantize experts * chore: fix lint * chore: comment * fix: missing link on readme
This commit is contained in:
@@ -30,7 +30,7 @@
|
||||
## 🎉 Latest Updates
|
||||
|
||||
- 2026/03:
|
||||
- New model support has been added in Axolotl for Qwen3.5, Qwen3.5 MoE, [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
|
||||
- New model support has been added in Axolotl for [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
|
||||
- [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (compatible with FSDP2).
|
||||
- 2026/02:
|
||||
- [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
|
||||
|
||||
@@ -45,6 +45,7 @@ lora_target_parameters:
|
||||
|
||||
## Limitations
|
||||
|
||||
- `lora_target_linear` is not compatible with `quantize_moe_experts`. See [Expert LoRA targeting](#expert-lora-targeting) instead.
|
||||
- `cpu_ram_efficient_loading` hangs or takes a long time with FSDP2 + QLoRA.
|
||||
- Total model parameter count may display incorrectly (trainable param count is correct).
|
||||
- FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this.
|
||||
|
||||
@@ -1302,6 +1302,11 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
||||
@classmethod
|
||||
def check_quantize_moe_experts(cls, data):
|
||||
if data.get("quantize_moe_experts"):
|
||||
if data.get("lora_target_linear"):
|
||||
raise ValueError(
|
||||
"lora_target_linear is not compatible with quantize_moe_experts. "
|
||||
"Use lora_target_parameters to target expert weights instead."
|
||||
)
|
||||
if data.get("adapter") not in ("lora", "qlora"):
|
||||
raise ValueError("quantize_moe_experts requires adapter: lora or qlora")
|
||||
if not (data.get("load_in_4bit") or data.get("load_in_8bit")):
|
||||
|
||||
@@ -79,6 +79,20 @@ class TestQuantizeMoeExpertsValidation:
|
||||
result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
|
||||
assert result["quantize_moe_experts"] is False
|
||||
|
||||
def test_rejects_lora_target_linear(self, min_base_cfg, gpu_caps, env_caps):
    """quantize_moe_experts with lora_target_linear should fail."""
    # Layer the incompatible options on top of the minimal base config.
    overrides = DictDefault(
        quantize_moe_experts=True,
        adapter="qlora",
        load_in_4bit=True,
        lora_target_linear=True,
    )
    cfg = overrides | min_base_cfg
    # Validation must reject targeting all linear layers while experts are quantized.
    with pytest.raises(ValueError, match="lora_target_linear is not compatible"):
        validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
|
||||
|
||||
def test_default_is_false(self, min_base_cfg, gpu_caps, env_caps):
|
||||
"""quantize_moe_experts should default to false."""
|
||||
cfg = DictDefault({}) | min_base_cfg
|
||||
|
||||
Reference in New Issue
Block a user