gemma4 support (#3574)

* gemma4 support * fixes * chore: lint
2026-04-02 17:46:46 -04:00
parent 573726c839
commit 08fc7de87e
16 changed files with 2082 additions and 45 deletions
--- a/examples/gemma4/26b-a4b-moe-qlora.yaml
+++ b/examples/gemma4/26b-a4b-moe-qlora.yaml
@@ -0,0 +1,104 @@
+# Gemma 4 26B-A4B MoE QLoRA with ScatterMoE kernels
+#
+# Validated: 50 steps on FineTome-100k, loss 7.4 -> 2.4, single RTX 5090 (32GB)
+#
+# Key notes:
+# - Flash Attention 2 is NOT supported (global_head_dim=512 > FA2 max of 256).
+#   Use sdp_attention instead.
+# - Gemma 4 is multimodal (text+vision+audio). For text-only SFT, restrict
+#   LoRA to the text backbone via lora_target_linear_modules regex.
+# - MoE experts use `experts_implementation: scattermoe` — Gemma 4 embeds MoE
+#   directly in the decoder layer (no SparseMoeBlock), so we register ScatterMoE
+#   via the transformers ExpertsInterface.
+# - Expert LoRA targets are `experts.gate_up_proj` / `experts.down_proj`
+#   (no `mlp.` prefix, unlike Qwen/Mixtral).
+# - micro_batch_size: 1 fits sequence_len: 2048 on a 32GB GPU with SDP attention.
+#   micro_batch_size: 4 works with sequence_len: 1024, or on 48GB+ GPUs.
+
+base_model: google/gemma-4-26B-A4B  # multimodal (text+vision+audio) MoE checkpoint
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.kernels.KernelsPlugin
+  - axolotl.integrations.liger.LigerPlugin
+use_kernels: true
+use_scattermoe: true  # ScatterMoE kernels for the MoE experts
+experts_implementation: scattermoe  # no SparseMoeBlock in Gemma 4; registered via ExpertsInterface
+torch_compile: false
+liger_layer_norm: true  # NOTE(review): liger_* flags presumably configure LigerPlugin — confirm
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+strict: false
+
+chat_template: gemma4
+datasets:
+  - path: mlabonne/FineTome-100k  # dataset used for the 50-step validation run
+    type: chat_template
+    split: train[:10%]  # first 10% of the train split only
+    field_messages: conversations  # presumably the record field holding the message list — verify
+    message_property_mappings:
+      role: from  # presumably: role is read from each message's `from` field — verify
+      content: value  # presumably: content is read from each message's `value` field — verify
+val_set_size: 0.05  # NOTE(review): presumably the fraction held out for evaluation — confirm
+output_dir: ./outputs/gemma4-26b-a4b-qlora
+
+sequence_len: 2048  # fits a 32GB GPU at micro_batch_size: 1 with SDP attention
+sample_packing: true
+
+load_in_4bit: true
+quantize_moe_experts: true  # NOTE(review): presumably extends quantization to the expert tensors — confirm
+adapter: qlora
+lora_r: 16
+lora_alpha: 32  # 2x lora_r
+lora_dropout: 0
+
+# Text-only SFT: adapters go on the language-model attention projections;
+# the vision/audio encoders get none. lora_target_modules stays empty on
+# purpose, and the regex list under lora_target_linear_modules picks modules.
+lora_target_modules: []
+lora_target_linear_modules:
+  - 'language_model\.model\.layers\.\d+\.self_attn\.(q|k|v|o)_proj'
+
+# Expert weights are 3D Parameter tensors rather than nn.Linear (no `mlp.` prefix)
+lora_target_parameters:
+  - 'experts.gate_up_proj'
+  - 'experts.down_proj'
+
+lora_o_kernel: false
+lora_qkv_kernel: false
+lora_mlp_kernel: false
+
+bnb_config_kwargs:
+  bnb_4bit_use_double_quant: true  # NOTE(review): presumably forwarded to the bitsandbytes 4-bit config — confirm
+
+wandb_project: gemma4-qlora
+wandb_entity:  # blank value parses as null; same for the wandb_* keys below
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1  # 1 fits 2048 seq_len on a 32GB GPU; 4 works with 1024 seq_len or on 48GB+ GPUs
+num_epochs: 1
+optimizer: adamw_torch_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true  # NOTE(review): presumably a memory-saving measure for the 32GB target — confirm
+activation_offloading: true  # NOTE(review): presumably a memory-saving measure for the 32GB target — confirm
+logging_steps: 1
+
+# FA2 not supported — Gemma4 global_head_dim=512 exceeds FA2 max of 256
+flash_attention: false  # must stay off: global_head_dim=512 is above the FA2 limit of 256
+sdp_attention: true  # SDP attention is used in place of FA2
+
+warmup_ratio: 0.1
+evals_per_epoch: 4
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens: