diff --git a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
index 8548a04e1..f66bcd370 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
@@ -31,10 +31,11 @@ lora_target_modules:
   - k_proj
   - v_proj
   - o_proj
-# Regex matching to target shared experts too
-# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
+  # Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
+  # - gate_up_proj
+  # - down_proj
 
-# Target experts
+# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
diff --git a/examples/qwen3.5/122b-a10b-moe-qlora.yaml b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
index 4d805c004..4447cf73c 100644
--- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
@@ -31,11 +31,11 @@ lora_target_modules:
   - k_proj
   - v_proj
   - o_proj
+  # Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
+  # - gate_up_proj
+  # - down_proj
 
-# Regex matching to target shared experts too
-# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
-
-# Target experts
+# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
index fd4adbe05..ad17366cb 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
@@ -31,11 +31,11 @@ lora_target_modules:
   - k_proj
   - v_proj
   - o_proj
+  # Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
+  # - gate_up_proj
+  # - down_proj
 
-# Regex matching to target shared experts too
-# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
-
-# Target experts
+# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
diff --git a/examples/qwen3.5/35b-a3b-moe-qlora.yaml b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
index 14b50703e..22468a178 100644
--- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
@@ -42,14 +42,14 @@ lora_target_modules:
   - k_proj
   - v_proj
   - o_proj
+  # Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
+  # - gate_up_proj
+  # - down_proj
 
-# Regex matching to target shared experts too
-# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
-
-# Target experts
-lora_target_parameters:
-  - mlp.experts.gate_up_proj
-  - mlp.experts.down_proj
+# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
+# lora_target_parameters:
+#   - mlp.experts.gate_up_proj
+#   - mlp.experts.down_proj
 
 lora_qkv_kernel: true
 lora_o_kernel: true
diff --git a/examples/qwen3.5/README.md b/examples/qwen3.5/README.md
index 6c0d01969..b5089d727 100644
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -59,12 +59,21 @@ lora_target_parameters:
 
 ### Shared Experts (MoE)
 
-Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:
+Shared experts use `nn.Linear` (unlike routed experts which are 3D `nn.Parameter` tensors), so they can be targeted via `lora_target_modules`. To also train shared expert projections alongside attention, uncomment `gate_up_proj` and `down_proj` in `lora_target_modules`:
 
 ```yaml
-lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  # Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
+  # - gate_up_proj
+  # - down_proj
 ```
 
+Use `lora_target_parameters` (see [Routed Experts](#routed-experts-moe) above) to target routed experts separately.
+
 ### TIPS
 
 - For inference hyp, please see the respective model card details.
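For reference, here is a minimal sketch of the LoRA targeting section with all of the options from the examples above enabled at once: attention and shared-expert projections through `lora_target_modules`, and routed experts through `lora_target_parameters`. It simply combines the commented-out entries shown in the configs above and is meant as an illustration, not a recommended default.

```yaml
# Sketch: combines the commented-out options from the example configs above.
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  # Shared experts are nn.Linear, so plain module names match them:
  - gate_up_proj
  - down_proj

# Routed experts are 3D nn.Parameter tensors, so they are targeted via lora_target_parameters:
lora_target_parameters:
  - mlp.experts.gate_up_proj
  - mlp.experts.down_proj
```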