Qwen3.5-MoE example config with lora_target_modules regex (#3515) [skip ci]

* lora target modules with regex * updates * fsdp for non moe * update wording * chore: cleanup and lint * chore: cleanup docs from merge --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>
2026-03-20 02:52:46 -07:00
parent 038ffe3f26
commit c57acef2c7
8 changed files with 309 additions and 44 deletions
--- a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
@@ -0,0 +1,84 @@
 base_model: Qwen/Qwen3.5-122B-A10B
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 strict: false
 chat_template: qwen3_5
 datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 val_set_size: 0.0
 output_dir: ./outputs/out
 dataset_prepared_path: last_run_prepared
 sequence_len: 2048
 sample_packing: true
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0
 lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
 # Alternative to the list above (replace it, do not keep both): regex that also targets the shared experts
 # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 # Target experts
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 lora_mlp_kernel: false
 lora_qkv_kernel: false
 lora_o_kernel: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 fsdp_config:
  fsdp_version: 2
  offload_params: true
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true
--- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml
+++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml
@@ -32,7 +32,11 @@ lora_target_modules:
  - v_proj
  - o_proj
-#lora_target_parameters:
+# Alternative to the list above (replace it, do not keep both): regex that also targets the shared experts
 # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 # Target experts
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
@@ -52,7 +56,6 @@ learning_rate: 0.0002
 bf16: auto
 tf32: true
 lora_mlp_kernel: false
 lora_qkv_kernel: false
 lora_o_kernel: false
--- a/examples/qwen3.5/27b-qlora-fsdp.yaml
+++ b/examples/qwen3.5/27b-qlora-fsdp.yaml
@@ -0,0 +1,81 @@
 base_model: Qwen/Qwen3.5-27B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 strict: false
 chat_template: qwen3_5
 datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 val_set_size: 0.0
 output_dir: ./outputs/out
 dataset_prepared_path: last_run_prepared
 sequence_len: 2048
 sample_packing: true
 load_in_4bit: true
 adapter: qlora
 lora_r: 16
 lora_alpha: 32
 lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
  # Uncomment below to also target the linear attention projections.
  # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific).
  # - linear_attn.in_proj_qkv
  # - linear_attn.in_proj_z
  # - linear_attn.out_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true
--- a/examples/qwen3.5/27b-qlora.yaml
+++ b/examples/qwen3.5/27b-qlora.yaml
@@ -1,9 +1,7 @@
 base_model: Qwen/Qwen3.5-27B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 # Note: Qwen3.5 is an early-fusion VLM (image+text). This config fine-tunes
 # the text-only path. For multimodal (image+text) fine-tuning, add image
 # columns to your dataset following axolotl's multimodal dataset format.
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
--- a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
@@ -0,0 +1,85 @@
 base_model: Qwen/Qwen3.5-35B-A3B
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 strict: false
 chat_template: qwen3_5
 datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 val_set_size: 0.0
 output_dir: ./outputs/out
 dataset_prepared_path: last_run_prepared
 sequence_len: 2048
 sample_packing: true
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0
 lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
 # Alternative to the list above (replace it, do not keep both): regex that also targets the shared experts
 # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 # Target experts
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 lora_mlp_kernel: false
 lora_qkv_kernel: false
 lora_o_kernel: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 fsdp_config:
  fsdp_version: 2
  offload_params: true
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true
--- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml
+++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml
@@ -32,7 +32,11 @@ lora_target_modules:
  - v_proj
  - o_proj
-#lora_target_parameters:
+# Alternative to the list above (replace it, do not keep both): regex that also targets the shared experts
 # lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 # Target experts
 # lora_target_parameters:
 #   - mlp.experts.gate_up_proj
 #   - mlp.experts.down_proj
--- a/examples/qwen3.5/9b-lora-vision.yaml
+++ b/examples/qwen3.5/9b-lora-vision.yaml
@@ -26,8 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 # Targets the language model attention and MLP layers.
 # Qwen3.5 is early-fusion: all layers (including those seeing vision tokens) share
 # the same transformer stack, so standard attention targets work for both modalities.
 lora_target_modules:
  - q_proj
  - k_proj
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -2,20 +2,6 @@
 [Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.
 Vision and text tokens are processed through the same transformer stack. The configs below train on text-only data unless noted otherwise. See `9b-lora-vision.yaml` for a multimodal example.
 Available configs:
 | Config | Model | Type | Peak VRAM |
 |---|---|---|---|
 | `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only QLoRA | ~47 GiB |
 | `27b-fft.yaml` | Qwen3.5-27B | Dense VLM, text-only FFT (vision frozen) | ~53 GiB |
 | `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
 | `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
 | `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
 | `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
 ## Getting started
 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
@@ -23,43 +9,69 @@ Available configs:
 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
 3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
-```bash
+  ```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+  pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
  ```
  > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
 4. Pick any config from the table below and run:
    ```bash
    axolotl train examples/qwen3.5/<config>.yaml
    ```
 Available configs:
 | Config | Model | Type | Peak VRAM |
 |---|---|---|---|
 | `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
 | `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
 | `27b-qlora.yaml` | Qwen3.5-27B | Dense, text-only QLoRA | ~47 GiB |
 | `27b-fft.yaml` | Qwen3.5-27B | Dense, text-only FFT (vision frozen) | ~53 GiB |
 | `27b-qlora-fsdp.yaml` | Qwen3.5-27B | Dense, text-only QLoRA + FSDP2 | — |
 | `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
 | `35b-a3b-moe-qlora-fsdp.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA + FSDP2 | — |
 | `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
 | `122b-a10b-moe-qlora-fsdp.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA + FSDP2 | — |
 ### Gated DeltaNet Linear Attention
 Qwen3.5 interleaves standard attention with Gated DeltaNet linear attention layers. To apply LoRA to them, add to `lora_target_modules`:
 ```yaml
 lora_target_modules:
  # ... standard projections ...
  - linear_attn.in_proj_qkv
  - linear_attn.in_proj_z
  - linear_attn.out_proj
 ```
 > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
-4. Run a finetuning example:
+### Routed Experts (MoE)
-```bash
+To apply LoRA to routed expert parameters, add `lora_target_parameters`:
 # Dense 27B text-only (QLoRA, ~47 GiB VRAM with sample packing)
 axolotl train examples/qwen3.5/27b-qlora.yaml
-# Dense 27B text-only FFT with vision encoder frozen (~53 GiB, single 80 GiB GPU)
+```yaml
-axolotl train examples/qwen3.5/27b-fft.yaml
+lora_target_parameters:
  - mlp.experts.gate_up_proj
  - mlp.experts.down_proj
 #  - mlp.gate.weight  # router
 ```
-# MoE 35B-A3B text-only (QLoRA)
+### Shared Experts (MoE)
 axolotl train examples/qwen3.5/35b-a3b-moe-qlora.yaml
-# MoE 122B-A10B text-only (QLoRA)
+Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:
 axolotl train examples/qwen3.5/122b-a10b-moe-qlora.yaml
 # 9B vision+text (LoRA, multimodal dataset)
 axolotl train examples/qwen3.5/9b-lora-vision.yaml
 # 9B vision+text FFT, single 80 GiB GPU (~61 GiB peak)
 axolotl train examples/qwen3.5/9b-fft-vision.yaml
 ```yaml
 lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 ```
 ### TIPS
- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
+- For inference hyperparameters, please see the respective model card.
 - For **text-only FFT** on 27B, use `27b-fft.yaml` which sets `unfrozen_parameters` to freeze the vision encoder (`model.visual.*`) — this avoids wasting optimizer state on parameters that receive no gradient from text-only data.
 - You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
 - Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 - For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.
 - The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules` — they are commented out by default.
 ## Optimization Guides