diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index e5753732d..e2a9440e8 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -20,6 +20,7 @@ format:
 - [Gemma-3n](#sec-gemma-3n)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
+- [Qwen3.5](#sec-qwen3-5)
 - [GLM-4.6V](#sec-glm-4-6v)
 - [SmolVLM2](#sec-smolvlm2)
 - [LFM2-VL](#sec-lfm2-vl)
@@ -191,6 +192,23 @@ base_model: Qwen/Qwen3-VL-4B-Instruct
 chat_template: qwen2_vl # same as qwen2-vl
 ```
 
+### Qwen3.5 {#sec-qwen3-5}
+
+```yaml
+base_model: Qwen/Qwen3.5-9B
+
+chat_template: qwen3_5
+```
+
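+For multimodal training, also set the following (excerpted from the vision example configs; see `examples/qwen3.5/9b-lora-vision.yaml` for a complete example):
+
+```yaml
+processor_type: AutoProcessor
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+```
+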
 ### GLM-4.6V {#sec-glm-4-6v}
 
 Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.
diff --git a/examples/qwen3.5/27b-fft.yaml b/examples/qwen3.5/27b-fft.yaml
new file mode 100644
index 000000000..6e61bc695
--- /dev/null
+++ b/examples/qwen3.5/27b-fft.yaml
@@ -0,0 +1,59 @@
+base_model: Qwen/Qwen3.5-27B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# Full fine-tune (FFT) of the text-only path of Qwen3.5-27B.
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+strict: false
+
+chat_template: qwen3_5
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+val_set_size: 0.0
+output_dir: ./outputs/out
+dataset_prepared_path: last_run_prepared
+
+sequence_len: 2048
+sample_packing: true
+
+# Freeze the vision encoder: only parameters matching these regexes stay trainable
+unfrozen_parameters:
+  - model\.language_model\..*
+  - lm_head\..*
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 2
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+# evals_per_epoch: 4  # requires val_set_size > 0
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
diff --git a/examples/qwen3.5/9b-fft-vision.yaml b/examples/qwen3.5/9b-fft-vision.yaml
new file mode 100644
index 000000000..b6aeb859d
--- /dev/null
+++ b/examples/qwen3.5/9b-fft-vision.yaml
@@ -0,0 +1,49 @@
+base_model: Qwen/Qwen3.5-9B
+processor_type: AutoProcessor
+
+# Required for multimodal training
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: qwen3_5
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+pad_to_sequence_len: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+# evals_per_epoch: 1  # requires val_set_size > 0
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
diff --git a/examples/qwen3.5/7b-lora-vision.yaml b/examples/qwen3.5/9b-lora-vision.yaml
similarity index 84%
rename from examples/qwen3.5/7b-lora-vision.yaml
rename to examples/qwen3.5/9b-lora-vision.yaml
index 79179ec96..9fb222901 100644
--- a/examples/qwen3.5/7b-lora-vision.yaml
+++ b/examples/qwen3.5/9b-lora-vision.yaml
@@ -1,10 +1,6 @@
-base_model: Qwen/Qwen3.5-7B
+base_model: Qwen/Qwen3.5-9B
 processor_type: AutoProcessor
 
-# Qwen3.5-7B and above are early-fusion VLMs (Qwen3_5ForConditionalGeneration).
-# Vision and text tokens are processed together by the same transformer layers.
-# Note: Qwen3.5-2B is a text-only model — the smallest VLM is Qwen3.5-7B.
-
 # These 3 lines are required for vision/multimodal training
 skip_prepare_dataset: true
 remove_unused_columns: false
diff --git a/examples/qwen3.5/README.md b/examples/qwen3.5/README.md
index 8a2f9b4bd..1b64bc78d 100644
--- a/examples/qwen3.5/README.md
+++ b/examples/qwen3.5/README.md
@@ -1,15 +1,19 @@
 # Finetune Qwen3.5 with Axolotl
 
-[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35-68452f3bc6e4b7cfb4e1c803) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. Models from 7B onwards are early-fusion vision-language models (`Qwen3_5ForConditionalGeneration`), meaning vision and text tokens are processed through the same transformer stack. The 2B variant is text-only.
+[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35-68452f3bc6e4b7cfb4e1c803) is a hybrid-architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.
+
+Vision and text tokens are processed through the same transformer stack. The configs below train on text-only data unless noted otherwise. See `9b-lora-vision.yaml` for a multimodal example.
 
 Available configs:
 
-| Config | Model | Type |
-|---|---|---|
-| `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only path |
-| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only path |
-| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only path |
-| `7b-lora-vision.yaml` | Qwen3.5-7B | Vision+text (multimodal) |
+| Config | Model | Type | Peak VRAM |
+|---|---|---|---|
+| `27b-qlora.yaml` | Qwen3.5-27B | Dense VLM, text-only QLoRA | ~47 GiB |
+| `27b-fft.yaml` | Qwen3.5-27B | Dense VLM, text-only FFT (vision frozen) | ~53 GiB |
+| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
+| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
+| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
+| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
 
 
 ## Getting started
@@ -29,23 +34,44 @@ pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
 # Dense 27B text-only (QLoRA, ~47 GiB VRAM with sample packing)
 axolotl train examples/qwen3.5/27b-qlora.yaml
 
+# Dense 27B text-only FFT with vision encoder frozen (~53 GiB, single 80 GiB GPU)
+axolotl train examples/qwen3.5/27b-fft.yaml
+
 # MoE 35B-A3B text-only (QLoRA)
 axolotl train examples/qwen3.5/35b-a3b-moe-qlora.yaml
 
 # MoE 122B-A10B text-only (QLoRA)
 axolotl train examples/qwen3.5/122b-a10b-moe-qlora.yaml
 
-# 7B vision+text (LoRA, multimodal dataset)
-axolotl train examples/qwen3.5/7b-lora-vision.yaml
+# 9B vision+text (LoRA, multimodal dataset)
+axolotl train examples/qwen3.5/9b-lora-vision.yaml
+
+# 9B vision+text FFT, single 80 GiB GPU (~61 GiB peak)
+axolotl train examples/qwen3.5/9b-fft-vision.yaml
 ```
 
 ### TIPS
 
 - For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
-- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
+- For **text-only FFT** on 27B, use `27b-fft.yaml`: its `unfrozen_parameters` list keeps only the language model and `lm_head` trainable, so the vision encoder (`model.visual.*`) stays frozen and no optimizer state is spent on parameters that receive no gradient from text-only data.
+- You can run a full finetuning of the smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
 - Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `7b-lora-vision.yaml`.
-- The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules` — they are commented out by default.
+- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.
+- The Gated DeltaNet linear attention layers (`linear_attn.*`) can optionally be added to `lora_target_modules`; they are commented out by default. See the sketch after this list.
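+
+For example, a minimal sketch of extending `lora_target_modules` (the `linear_attn` entries below are illustrative placeholders; mirror the commented-out entries that ship in the QLoRA configs rather than copying these names verbatim):
+
+```yaml
+lora_target_modules:
+  - q_proj
+  - k_proj
+  - v_proj
+  - o_proj
+  # Placeholder names for the Gated DeltaNet layers; verify against the
+  # model's actual module names before uncommenting:
+  # - linear_attn.in_proj
+  # - linear_attn.out_proj
+```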
 
 ## Optimization Guides