diff --git a/examples/llama-4/README.md b/examples/llama-4/README.md
new file mode 100644
index 000000000..53448da2b
--- /dev/null
+++ b/examples/llama-4/README.md
@@ -0,0 +1,22 @@
+# Llama 4 by Meta AI
+
+## Available Examples
+
+### Llama 4 Scout 17Bx16Experts (109B)
+- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
+- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
+- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
+
+Our single-GPU configuration for Llama 4 Scout post-trains with a 4k context length in only 68.5 GB of VRAM, at 546 tokens/second; see the usage snippet below.
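+
+## Usage
+
+A minimal launch sketch, assuming the standard axolotl entrypoints (`axolotl train` and the `accelerate launch -m axolotl.cli.train` module form); adjust the paths if your configs live elsewhere:
+
+```bash
+# text-only QLoRA on a single H100
+axolotl train examples/llama-4/scout-qlora-single-h100.yaml
+
+# multi-GPU text QLoRA sharded with FSDP1
+accelerate launch -m axolotl.cli.train examples/llama-4/scout-qlora-fsdp1.yaml
+```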
diff --git a/examples/llama-4/scout-qlora-single-h100.yaml b/examples/llama-4/scout-qlora-single-h100.yaml
new file mode 100644
index 000000000..23a3a2195
--- /dev/null
+++ b/examples/llama-4/scout-qlora-single-h100.yaml
@@ -0,0 +1,86 @@
+base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
+model_type: Llama4ForConditionalGeneration
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+strict: false
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true
+load_in_4bit: true
+adapter: qlora
+lora_r: 32
+lora_alpha: 64
+lora_target_modules:
+  - self_attn.q_proj
+  - self_attn.k_proj
+  - self_attn.v_proj
+  - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  # - experts.gate_projs.[0-9]+$
+  # - experts.up_projs.[0-9]+$
+  # - experts.down_projs.[0-9]+$
+lora_modules_to_save:
+  # - lm_head
+  # - embed_tokens
+
+lora_mlp_kernel: true
+lora_qkv_kernel: true
+lora_o_kernel: true
+
+chat_template: llama4
+datasets:
+  - path: mlabonne/FineTome-100k
+    type: chat_template
+    split: train[:20%]
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+sequence_len: 4096 # up to 8k will work on a single H100
+sample_packing: true
+pad_to_sequence_len: true
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_torch_4bit
+lr_scheduler: cosine
+learning_rate: 1e-4
+
+bf16: true
+tf32: true
+
+logging_steps: 1
+flash_attention: true
+
+gradient_checkpointing: offload
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+
+warmup_steps: 20
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+  eos_token: <|eot|>
diff --git a/examples/llama-4/scout-lora.yaml b/examples/llama-4/scout-vision-qlora-fsdp.yaml
similarity index 51%
rename from examples/llama-4/scout-lora.yaml
rename to examples/llama-4/scout-vision-qlora-fsdp.yaml
index 26534b560..8b8c9abd1 100644
--- a/examples/llama-4/scout-lora.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp.yaml
@@ -1,13 +1,28 @@
-base_model: meta-llama/Llama-4-Scout-17B-16E
+base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
 model_type: Llama4ForConditionalGeneration
+processor_type: Llama4Processor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
 strict: false
-
 # torch_compile: true
+# these 3 lines are needed for now to handle vision chat templates with images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
 
-adapter: lora
+sequence_len: 4096
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true # use Axolotl's customized model
+load_in_4bit: true
+adapter: qlora
 lora_r: 32
 lora_alpha: 64
 lora_target_modules:
@@ -15,60 +30,59 @@ lora_target_modules:
   - self_attn.k_proj
   - self_attn.v_proj
   - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  - vision_adapter.mlp.fc1
+  - vision_adapter.mlp.fc2
+  # - experts.gate_projs.[0-9]+$
+  # - experts.up_projs.[0-9]+$
+  # - experts.down_projs.[0-9]+$
 lora_modules_to_save:
   - lm_head
   - embed_tokens
 
 chat_template: llama4
 datasets:
-  - path: mlabonne/FineTome-100k
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
     type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
+    split: train[:1%]
+    field_messages: messages
 
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: ./outputs/out
 
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch_8bit
+optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 2e-5
 
 bf16: true
 tf32: true
-# gradient_checkpointing: true
-# gradient_checkpointing_kwargs:
-#   use_reentrant: false
 
 logging_steps: 1
 flash_attention: true
 
 warmup_steps: 100
-evals_per_epoch: 2
+evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 fsdp:
   - auto_wrap
   - full_shard
 fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
+  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
   fsdp_cpu_ram_efficient_loading: true
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_state_dict_type: SHARDED_STATE_DICT
+  fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_sharding_strategy: FULL_SHARD
-  fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
   pad_token: <|finetune_right_pad_id|>