From c57acef2c7d3efb04d7d80a2334895a5caef6f9a Mon Sep 17 00:00:00 2001 From: Owen Arliawan <37985114+Nero10578@users.noreply.github.com> Date: Fri, 20 Mar 2026 02:52:46 -0700 Subject: [PATCH] Qwen3.5-MoE example config with lora_target_modules regex (#3515) [skip ci] * lora target modules with regex * updates * fsdp for non moe * update wording * chore: cleanup and lint * chore: cleanup docs from merge --------- Co-authored-by: NanoCode012 --- .../qwen3.5/122b-a10b-moe-qlora-fsdp.yaml | 84 ++++++++++++++++++ examples/qwen3.5/122b-a10b-moe-qlora.yaml | 7 +- examples/qwen3.5/27b-qlora-fsdp.yaml | 81 ++++++++++++++++++ examples/qwen3.5/27b-qlora.yaml | 4 +- examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml | 85 +++++++++++++++++++ examples/qwen3.5/35b-a3b-moe-qlora.yaml | 6 +- examples/qwen3.5/9b-lora-vision.yaml | 2 - examples/qwen3.5/README.md | 84 ++++++++++-------- 8 files changed, 309 insertions(+), 44 deletions(-) create mode 100644 examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml create mode 100644 examples/qwen3.5/27b-qlora-fsdp.yaml create mode 100644 examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml diff --git a/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml new file mode 100644 index 000000000..8548a04e1 --- /dev/null +++ b/examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml @@ -0,0 +1,84 @@ +base_model: Qwen/Qwen3.5-122B-A10B + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +strict: false + +chat_template: qwen3_5 +datasets: + - path: mlabonne/FineTome-100k + type: chat_template + split: train[:20%] + field_messages: conversations + message_property_mappings: + role: from + content: value +val_set_size: 0.0 +output_dir: ./outputs/out +dataset_prepared_path: last_run_prepared + +sequence_len: 2048 +sample_packing: true + +load_in_4bit: true +quantize_moe_experts: true +adapter: qlora +lora_r: 16 +lora_alpha: 32 +lora_dropout: 0 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj +# Regex matching to target shared experts too +# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' + +# Target experts +# lora_target_parameters: +# - mlp.experts.gate_up_proj +# - mlp.experts.down_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_torch_4bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +lora_mlp_kernel: false +lora_qkv_kernel: false +lora_o_kernel: false + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 4 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +fsdp_config: + fsdp_version: 2 + offload_params: true + cpu_ram_efficient_loading: false + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer + state_dict_type: FULL_STATE_DICT + sharding_strategy: FULL_SHARD + reshard_after_forward: true + activation_checkpointing: true diff --git a/examples/qwen3.5/122b-a10b-moe-qlora.yaml b/examples/qwen3.5/122b-a10b-moe-qlora.yaml index e9cbf80ce..4d805c004 100644 --- a/examples/qwen3.5/122b-a10b-moe-qlora.yaml +++ b/examples/qwen3.5/122b-a10b-moe-qlora.yaml @@ -32,7 +32,11 @@ lora_target_modules: - v_proj - o_proj -#lora_target_parameters: +# Regex matching to target shared experts too +# 
lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' + +# Target experts +# lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj @@ -52,7 +56,6 @@ learning_rate: 0.0002 bf16: auto tf32: true - lora_mlp_kernel: false lora_qkv_kernel: false lora_o_kernel: false diff --git a/examples/qwen3.5/27b-qlora-fsdp.yaml b/examples/qwen3.5/27b-qlora-fsdp.yaml new file mode 100644 index 000000000..79b87a32f --- /dev/null +++ b/examples/qwen3.5/27b-qlora-fsdp.yaml @@ -0,0 +1,81 @@ +base_model: Qwen/Qwen3.5-27B + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +strict: false + +chat_template: qwen3_5 +datasets: + - path: mlabonne/FineTome-100k + type: chat_template + split: train[:20%] + field_messages: conversations + message_property_mappings: + role: from + content: value +val_set_size: 0.0 +output_dir: ./outputs/out +dataset_prepared_path: last_run_prepared + +sequence_len: 2048 +sample_packing: true + +load_in_4bit: true +adapter: qlora +lora_r: 16 +lora_alpha: 32 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + - down_proj + - up_proj + # Uncomment below to also target the linear attention projections. + # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific). + # - linear_attn.in_proj_qkv + # - linear_attn.in_proj_z + # - linear_attn.out_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_torch_4bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 4 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +fsdp_config: + fsdp_version: 2 + offload_params: false + cpu_ram_efficient_loading: false + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer + state_dict_type: FULL_STATE_DICT + sharding_strategy: FULL_SHARD + reshard_after_forward: true + activation_checkpointing: true diff --git a/examples/qwen3.5/27b-qlora.yaml b/examples/qwen3.5/27b-qlora.yaml index 2ba1c4ed7..18c0af95b 100644 --- a/examples/qwen3.5/27b-qlora.yaml +++ b/examples/qwen3.5/27b-qlora.yaml @@ -1,9 +1,7 @@ base_model: Qwen/Qwen3.5-27B + # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name -# Note: Qwen3.5 is an early-fusion VLM (image+text). This config fine-tunes -# the text-only path. For multimodal (image+text) fine-tuning, add image -# columns to your dataset following axolotl's multimodal dataset format. 
plugins: - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin diff --git a/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml new file mode 100644 index 000000000..fd4adbe05 --- /dev/null +++ b/examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml @@ -0,0 +1,85 @@ +base_model: Qwen/Qwen3.5-35B-A3B + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +strict: false + +chat_template: qwen3_5 +datasets: + - path: mlabonne/FineTome-100k + type: chat_template + split: train[:20%] + field_messages: conversations + message_property_mappings: + role: from + content: value +val_set_size: 0.0 +output_dir: ./outputs/out +dataset_prepared_path: last_run_prepared + +sequence_len: 2048 +sample_packing: true + +load_in_4bit: true +quantize_moe_experts: true +adapter: qlora +lora_r: 16 +lora_alpha: 32 +lora_dropout: 0 +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + +# Regex matching to target shared experts too +# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' + +# Target experts +# lora_target_parameters: +# - mlp.experts.gate_up_proj +# - mlp.experts.down_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_torch_4bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +lora_mlp_kernel: false +lora_qkv_kernel: false +lora_o_kernel: false + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 4 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +fsdp_config: + fsdp_version: 2 + offload_params: true + cpu_ram_efficient_loading: false + auto_wrap_policy: TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer + state_dict_type: FULL_STATE_DICT + sharding_strategy: FULL_SHARD + reshard_after_forward: true + activation_checkpointing: true diff --git a/examples/qwen3.5/35b-a3b-moe-qlora.yaml b/examples/qwen3.5/35b-a3b-moe-qlora.yaml index 462babf0b..dea45801c 100644 --- a/examples/qwen3.5/35b-a3b-moe-qlora.yaml +++ b/examples/qwen3.5/35b-a3b-moe-qlora.yaml @@ -32,7 +32,11 @@ lora_target_modules: - v_proj - o_proj -#lora_target_parameters: +# Regex matching to target shared experts too +# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj' + +# Target experts +# lora_target_parameters: # - mlp.experts.gate_up_proj # - mlp.experts.down_proj diff --git a/examples/qwen3.5/9b-lora-vision.yaml b/examples/qwen3.5/9b-lora-vision.yaml index 9fb222901..1c3717724 100644 --- a/examples/qwen3.5/9b-lora-vision.yaml +++ b/examples/qwen3.5/9b-lora-vision.yaml @@ -26,8 +26,6 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 # Targets the language model attention and MLP layers. -# Qwen3.5 is early-fusion: all layers (including those seeing vision tokens) share -# the same transformer stack, so standard attention targets work for both modalities. 
lora_target_modules:
  - q_proj
  - k_proj
diff --git a/examples/qwen3.5/README.md b/examples/qwen3.5/README.md
index 1b64bc78d..6c0d01969 100644
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -2,20 +2,6 @@
 [Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.
-Vision and text tokens are processed through the same transformer stack. The configs below train on text-only data unless noted otherwise. See `9b-lora-vision.yaml` for a multimodal example.
-
-Available configs:
-
-| Config | Model | Type | Peak VRAM |
-|---|---|---|---|
-| `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only QLoRA | ~47 GiB |
-| `27b-fft.yaml` | Qwen3.5-27B | Dense VLM, text-only FFT (vision frozen) | ~53 GiB |
-| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
-| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
-| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
-| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
-
-
 ## Getting started
 
 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
@@ -23,43 +9,69 @@
 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
 
 3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
 
-```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+   ```bash
+   pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
+   ```
+
+   > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
+
+4. Pick any config from the table below and run:
+
+   ```bash
+   axolotl train examples/qwen3.5/<config>.yaml
+   ```
+
+Available configs:
+
+| Config | Model | Type | Peak VRAM |
+|---|---|---|---|
+| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
+| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
+| `27b-qlora.yaml` | Qwen3.5-27B | Dense, text-only QLoRA | ~47 GiB |
+| `27b-fft.yaml` | Qwen3.5-27B | Dense, text-only FFT (vision frozen) | ~53 GiB |
+| `27b-qlora-fsdp.yaml` | Qwen3.5-27B | Dense, text-only QLoRA + FSDP2 | — |
+| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
+| `35b-a3b-moe-qlora-fsdp.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA + FSDP2 | — |
+| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
+| `122b-a10b-moe-qlora-fsdp.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA + FSDP2 | — |
+
+### Gated DeltaNet Linear Attention
+
+Qwen3.5 interleaves standard attention with Gated DeltaNet linear attention layers. To apply LoRA to them, add to `lora_target_modules`:
+
+```yaml
+lora_target_modules:
+  # ... standard projections ...
+  - linear_attn.in_proj_qkv
+  - linear_attn.in_proj_z
+  - linear_attn.out_proj
 ```
-> FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.
 
-4. 
Run a finetuning example:
-
-```bash
-# Dense 27B text-only (QLoRA, ~47 GiB VRAM with sample packing)
-axolotl train examples/qwen3.5/27b-qlora.yaml
+### Routed Experts (MoE)
 
-# Dense 27B text-only FFT with vision encoder frozen (~53 GiB, single 80 GiB GPU)
-axolotl train examples/qwen3.5/27b-fft.yaml
+To apply LoRA to routed expert parameters, add `lora_target_parameters`:
 
-# MoE 35B-A3B text-only (QLoRA)
-axolotl train examples/qwen3.5/35b-a3b-moe-qlora.yaml
+```yaml
+lora_target_parameters:
+  - mlp.experts.gate_up_proj
+  - mlp.experts.down_proj
+# - mlp.gate.weight # router
+```
 
-# MoE 122B-A10B text-only (QLoRA)
-axolotl train examples/qwen3.5/122b-a10b-moe-qlora.yaml
-
-# 9B vision+text (LoRA, multimodal dataset)
-axolotl train examples/qwen3.5/9b-lora-vision.yaml
-
-# 9B vision+text FFT, single 80 GiB GPU (~61 GiB peak)
-axolotl train examples/qwen3.5/9b-fft-vision.yaml
+### Shared Experts (MoE)
 
+Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:
+```yaml
+lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
 ```
 
 ### TIPS
 
-- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
-- For **text-only FFT** on 27B, use `27b-fft.yaml` which sets `unfrozen_parameters` to freeze the vision encoder (`model.visual.*`) — this avoids wasting optimizer state on parameters that receive no gradient from text-only data.
+- For inference hyperparameters, please see the respective model card for details.
 - You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
 - Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 - For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.
-- The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules` — they are commented out by default.
 
 ## Optimization Guides