From 9f986f5e71530557b59182df38d2b19858e7b440 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 9 Apr 2025 14:01:28 -0400
Subject: [PATCH] Add Llama4 maverick examples (#2512)

---
 examples/llama-4/README.md                 |  8 +-
 examples/llama-4/maverick-qlora-fsdp1.yaml | 89 ++++++++++++++++++++++
 2 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 examples/llama-4/maverick-qlora-fsdp1.yaml

diff --git a/examples/llama-4/README.md b/examples/llama-4/README.md
index 53448da2b..a0ec1c70e 100644
--- a/examples/llama-4/README.md
+++ b/examples/llama-4/README.md
@@ -7,4 +7,10 @@
 - [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
 - [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)

-Our Single GPU implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second.
+Our Single H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)
+
+### Llama 4 Maverick 17Bx128Experts (400B)
+
+- [Text Multi GPU QLoRA w/ FSDP1](./maverick-qlora-fsdp1.yaml)
+
+Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)

diff --git a/examples/llama-4/maverick-qlora-fsdp1.yaml b/examples/llama-4/maverick-qlora-fsdp1.yaml
new file mode 100644
index 000000000..232afc73e
--- /dev/null
+++ b/examples/llama-4/maverick-qlora-fsdp1.yaml
@@ -0,0 +1,89 @@
+base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
+model_type: Llama4ForConditionalGeneration
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+strict: false
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true
+load_in_4bit: true
+adapter: qlora
+lora_r: 32
+lora_alpha: 64
+lora_target_modules:
+  - self_attn.q_proj
+  - self_attn.k_proj
+  - self_attn.v_proj
+  - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  # - experts.gate_projs.[0-9]+$
+  # - experts.up_projs.[0-9]+$
+  # - experts.down_projs.[0-9]+$
+lora_modules_to_save:
+# - lm_head
+# - embed_tokens
+
+chat_template: llama4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 1e-4
+
+bf16: true
+tf32: true
+
+logging_steps: 1
+flash_attention: true
+
+gradient_checkpointing: offload
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+warmup_steps: 20
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+fsdp:
+  - auto_wrap
+  - full_shard
+fsdp_config:
+  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+  eos_token: <|eot|>
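
For readers who want to see what the QLoRA knobs in `maverick-qlora-fsdp1.yaml` correspond to outside of axolotl, below is a minimal sketch that expresses `load_in_4bit`, `lora_r`, `lora_alpha`, and `lora_target_modules` as Hugging Face `bitsandbytes`/`peft` config objects. This is an illustration under that assumption, not axolotl's internal code path; axolotl builds the equivalent objects itself from the YAML.

```python
# Minimal sketch (not axolotl internals): the QLoRA settings from
# maverick-qlora-fsdp1.yaml restated as peft/bitsandbytes config objects.
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig

# load_in_4bit: true, with bf16 compute -- the base checkpoint referenced in the
# YAML is already a bnb NF4 linearized export, so 4-bit NF4 is assumed here.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# lora_r: 32 / lora_alpha: 64 / lora_target_modules from the YAML. The routed-expert
# projections are commented out in the example config, so only the attention and
# shared-expert projections receive adapters.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.0,
    bias="none",
    target_modules=[
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
        "shared_expert.gate_proj",
        "shared_expert.up_proj",
        "shared_expert.down_proj",
    ],
)

print(bnb_config.to_dict())
print(lora_config.to_dict())
```

In practice the run is driven entirely from the YAML (for example with `accelerate launch -m axolotl.cli.train examples/llama-4/maverick-qlora-fsdp1.yaml`), and the `fsdp`/`fsdp_config` block handles sharding the quantized base weights and adapters across the 4 GPUs.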