qwen3.5 configs (#3554) [skip ci]

* qwen3.5  configs

* update shared experts readme
This commit is contained in:
VED
2026-04-01 18:49:31 +05:30
committed by GitHub
parent 5e5603c9aa
commit 9e64c76326
5 changed files with 30 additions and 20 deletions

View File

@@ -31,10 +31,11 @@ lora_target_modules:
- k_proj
- v_proj
- o_proj
# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
# Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
# - gate_up_proj
# - down_proj
# Target experts
# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
# lora_target_parameters:
# - mlp.experts.gate_up_proj
# - mlp.experts.down_proj

View File

@@ -31,11 +31,11 @@ lora_target_modules:
- k_proj
- v_proj
- o_proj
# Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
# - gate_up_proj
# - down_proj
# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
# Target experts
# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
# lora_target_parameters:
# - mlp.experts.gate_up_proj
# - mlp.experts.down_proj

View File

@@ -31,11 +31,11 @@ lora_target_modules:
- k_proj
- v_proj
- o_proj
# Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
# - gate_up_proj
# - down_proj
# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
# Target experts
# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
# lora_target_parameters:
# - mlp.experts.gate_up_proj
# - mlp.experts.down_proj

View File

@@ -42,14 +42,14 @@ lora_target_modules:
- k_proj
- v_proj
- o_proj
# Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
# - gate_up_proj
# - down_proj
# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
# Target experts
lora_target_parameters:
- mlp.experts.gate_up_proj
- mlp.experts.down_proj
# Target routed experts (3D nn.Parameter tensors, not nn.Linear — use lora_target_parameters):
# lora_target_parameters:
# - mlp.experts.gate_up_proj
# - mlp.experts.down_proj
lora_qkv_kernel: true
lora_o_kernel: true

View File

@@ -59,12 +59,21 @@ lora_target_parameters:
### Shared Experts (MoE)
Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:
Shared experts use `nn.Linear` (unlike routed experts which are 3D `nn.Parameter` tensors), so they can be targeted via `lora_target_modules`. To also train shared expert projections alongside attention, uncomment `gate_up_proj` and `down_proj` in `lora_target_modules`:
```yaml
lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
# Add gate_up_proj and down_proj to also target shared experts (nn.Linear):
# - gate_up_proj
# - down_proj
```
Use `lora_target_parameters` (see [Routed Experts](#routed-experts-moe) above) to target routed experts separately.
### TIPS
- For inference hyperparameters, please see the respective model card for details.