From 9f986f5e71530557b59182df38d2b19858e7b440 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 9 Apr 2025 14:01:28 -0400
Subject: [PATCH] Add Llama4 maverick examples (#2512)

---
 examples/llama-4/README.md                 |  8 +-
 examples/llama-4/maverick-qlora-fsdp1.yaml | 89 ++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 examples/llama-4/maverick-qlora-fsdp1.yaml

diff --git a/examples/llama-4/README.md b/examples/llama-4/README.md
index 53448da2b..a0ec1c70e 100644
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -7,4 +7,10 @@
 - [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
 - [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)

-Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
+Our Single H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)
+
+### Llama 4 Maverick 17Bx128Experts (400B)
+
+- [Text Multi GPU QLoRA w/ FSDP1](./maverick-qlora-fsdp1.yaml)
+
+Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)

diff --git a/examples/llama-4/maverick-qlora-fsdp1.yaml b/examples/llama-4/maverick-qlora-fsdp1.yaml
new file mode 100644
index 000000000..232afc73e
--- /dev/null
+++ b/examples/llama-4/maverick-qlora-fsdp1.yaml
@@ -0,0 +1,89 @@
+base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
+model_type: Llama4ForConditionalGeneration
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+strict: false
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true
+load_in_4bit: true
+adapter: qlora
+lora_r: 32
+lora_alpha: 64
+lora_target_modules:
+  - self_attn.q_proj
+  - self_attn.k_proj
+  - self_attn.v_proj
+  - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  # - experts.gate_projs.[0-9]+$
+  # - experts.up_projs.[0-9]+$
+  # - experts.down_projs.[0-9]+$
+lora_modules_to_save:
+# - lm_head
+# - embed_tokens
+
+chat_template: llama4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 1e-4
+
+bf16: true
+tf32: true
+
+logging_steps: 1
+flash_attention: true
+
+gradient_checkpointing: offload
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+warmup_steps: 20
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+fsdp:
+  - auto_wrap
+  - full_shard
+fsdp_config:
+  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+  eos_token: <|eot|>
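
For readers who want to see what the QLoRA knobs in `maverick-qlora-fsdp1.yaml` correspond to outside of axolotl, below is a minimal sketch that expresses `load_in_4bit`, `lora_r`, `lora_alpha`, and `lora_target_modules` as Hugging Face `bitsandbytes`/`peft` config objects. This is an illustration under that assumption, not axolotl's internal code path; axolotl builds the equivalent objects itself from the YAML.

```python
# Minimal sketch (not axolotl internals): the QLoRA settings from
# maverick-qlora-fsdp1.yaml restated as peft/bitsandbytes config objects.
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig

# load_in_4bit: true, with bf16 compute -- the base checkpoint referenced in the
# YAML is already a bnb NF4 linearized export, so 4-bit NF4 is assumed here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# lora_r: 32 / lora_alpha: 64 / lora_target_modules from the YAML. The routed-expert
# projections are commented out in the example config, so only the attention and
# shared-expert projections receive adapters.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "shared_expert.gate_proj",
        "shared_expert.up_proj",
        "shared_expert.down_proj",
    ],
)

print(bnb_config.to_dict())
print(lora_config.to_dict())
```

In practice the run is driven entirely from the YAML (for example with `accelerate launch -m axolotl.cli.train examples/llama-4/maverick-qlora-fsdp1.yaml`), and the `fsdp`/`fsdp_config` block handles sharding the quantized base weights and adapters across the 4 GPUs.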