Add Llama4 maverick examples

2025-04-09 08:27:46 -04:00
138 changed files with 222 additions and 329 deletions
--- a/1
+++ b/1
@@ -1 +0,0 @@
 docs.axolotl.ai
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```
-Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
+Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
 ### Your First Fine-tune
@@ -78,7 +78,7 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```
-That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
 ## ✨ Key Features
@@ -91,20 +91,20 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 ## 📚 Documentation
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
+- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
+- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
+- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
+- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
+- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
+- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
 ## 🤝 Getting Help
 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
+- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
 - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
 ## 🌟 Contributing
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -90,7 +90,7 @@ lora_on_cpu: true
 # List[str]. Add plugins to extend the pipeline.
 # See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://docs.axolotl.ai/docs/custom_integrations.html
+# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
 plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
@@ -394,7 +394,7 @@ lora_fan_in_fan_out: false
 # Apply custom LoRA autograd functions and activation function Triton kernels for
 # speed and memory savings
-# See: https://docs.axolotl.ai/docs/lora_optims.html
+# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true
@@ -688,7 +688,7 @@ ddp_broadcast_buffers:
 # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
 # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
 # subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
+# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 # Must evenly divide the number of KV heads in your model.
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -457,7 +457,10 @@ datasets:
    type: alpaca
 ```
-Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
+Axolotl supports many kinds of instruction datasets. All of them can be found [here](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
 Reference: [Instruction Dataset Documentation](inst_tune.qmd).
 #### Custom Instruct Prompt Format
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -36,9 +36,6 @@ deepspeed: deepspeed_configs/zero1.json
 ### Usage {#sec-deepspeed-usage}
 ```{.bash}
 # Fetch deepspeed configs (if not already present)
 axolotl fetch deepspeed_configs
 # Passing arg via config
 axolotl train config.yml
@@ -51,20 +48,10 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:
 - ZeRO Stage 1 (`zero1.json`)
 - ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
 - ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
 - ZeRO Stage 3 with bf16 and CPU offload params (`zero3_bf16_cpuoffload_params.json`)
 - ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
-::: {.callout-tip}
+Choose based on your memory requirements and performance needs.
 For best performance, choose the configuration that offloads the least to memory while still fitting in VRAM.
 Start from Stage 1 -> Stage 2 -> Stage 3.
 :::
 ## FSDP {#sec-fsdp}
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -530,7 +530,7 @@ trl:
 ```
 ```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
+CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
 ```
 Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
--- a/examples/cerebras/btlm-ft.yml
+++ b/examples/cerebras/btlm-ft.yml
@@ -8,6 +8,7 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -4,6 +4,7 @@ base_model: cerebras/Cerebras-GPT-1.3B
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/code-llama/13b/lora.yml
+++ b/examples/code-llama/13b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/code-llama/13b/qlora.yml
+++ b/examples/code-llama/13b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/code-llama/34b/lora.yml
+++ b/examples/code-llama/34b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/code-llama/34b/qlora.yml
+++ b/examples/code-llama/34b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/code-llama/7b/lora.yml
+++ b/examples/code-llama/7b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/code-llama/7b/qlora.yml
+++ b/examples/code-llama/7b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: CodeLlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -4,6 +4,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 # huggingface repo
 chat_template: cohere
--- a/examples/dbrx/16bit-lora.yaml
+++ b/examples/dbrx/16bit-lora.yaml
@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/dbrx/8bit-lora.yaml
+++ b/examples/dbrx/8bit-lora.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/dbrx/fft-ds-zero3.yaml
+++ b/examples/dbrx/fft-ds-zero3.yaml
@@ -3,6 +3,7 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/deepcoder/deepcoder-14B-preview-lora.yml
+++ b/examples/deepcoder/deepcoder-14B-preview-lora.yml
@@ -1,58 +0,0 @@
 base_model: agentica-org/DeepCoder-14B-Preview
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 adapter: lora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
@@ -1,58 +0,0 @@
 base_model: deepcogito/cogito-v1-preview-llama-3B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 adapter: lora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
+++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
@@ -1,58 +0,0 @@
 base_model: deepcogito/cogito-v1-preview-qwen-14B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/lora-out
 sequence_len: 4096
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 adapter: lora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/deepseek-v2/fft-fsdp-16b.yaml
+++ b/examples/deepseek-v2/fft-fsdp-16b.yaml
@@ -2,6 +2,7 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/deepseek-v2/qlora-fsdp-2_5.yaml
+++ b/examples/deepseek-v2/qlora-fsdp-2_5.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 plugins:
--- a/examples/falcon/config-7b-lora.yml
+++ b/examples/falcon/config-7b-lora.yml
@@ -11,6 +11,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/falcon/config-7b-qlora.yml
+++ b/examples/falcon/config-7b-qlora.yml
@@ -15,6 +15,7 @@ load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: QingyiSi/Alpaca-CoT
--- a/examples/falcon/config-7b.yml
+++ b/examples/falcon/config-7b.yml
@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 gptq: false
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/gemma/qlora.yml
+++ b/examples/gemma/qlora.yml
@@ -8,6 +8,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 # huggingface repo
 datasets:
--- a/examples/gemma2/qlora.yml
+++ b/examples/gemma2/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 # huggingface repo
 chat_template: gemma
--- a/examples/gemma2/reward-model.yaml
+++ b/examples/gemma2/reward-model.yaml
@@ -5,6 +5,7 @@ num_labels: 1
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 reward_model: true
 chat_template: gemma
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -10,6 +10,7 @@ ddp_find_unused_parameters: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 # huggingface repo
 chat_template: gemma3
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -1,4 +1,5 @@
 base_model: google/gemma-3-4b-it
 strict: false
 load_in_4bit: true
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -1,5 +1,6 @@
 base_model: google/gemma-3-4b-it
 processor_type: AutoProcessor
 strict: false
 load_in_4bit: true
--- a/examples/gptj/qlora.yml
+++ b/examples/gptj/qlora.yml
@@ -4,6 +4,7 @@ base_model: EleutherAI/gpt-j-6b
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/jamba/qlora.yaml
+++ b/examples/jamba/qlora.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/jamba/qlora_deepspeed.yaml
+++ b/examples/jamba/qlora_deepspeed.yaml
@@ -5,6 +5,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
 strict: false
 use_tensorboard: true
 chat_template: jamba
 datasets:
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-2/gptq-lora.yml
+++ b/examples/llama-2/gptq-lora.yml
@@ -10,6 +10,7 @@ gptq_disable_exllama: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:
--- a/examples/llama-2/lisa.yml
+++ b/examples/llama-2/lisa.yml
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-2/qlora-fsdp.yml
+++ b/examples/llama-2/qlora-fsdp.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: yahma/alpaca-cleaned
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-2/relora.yml
+++ b/examples/llama-2/relora.yml
@@ -5,6 +5,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -4,6 +4,7 @@ processor_type: AutoProcessor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
--- a/examples/llama-3/fft-8b-liger-fsdp.yaml
+++ b/examples/llama-3/fft-8b-liger-fsdp.yaml
@@ -9,6 +9,7 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 strict: false
 chat_template: llama3
 datasets:
--- a/examples/llama-3/fft-8b.yaml
+++ b/examples/llama-3/fft-8b.yaml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 chat_template: llama3
 rl: dpo
--- a/examples/llama-3/instruct-lora-8b.yml
+++ b/examples/llama-3/instruct-lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 chat_template: llama3
 datasets:
--- a/examples/llama-3/lora-1b-deduplicate-dpo.yml
+++ b/examples/llama-3/lora-1b-deduplicate-dpo.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 chat_template: llama3
 rl: dpo
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-3/lora-1b-kernels.yml
+++ b/examples/llama-3/lora-1b-kernels.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-3/lora-1b-ray.yml
+++ b/examples/llama-3/lora-1b-ray.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-3/lora-1b-sample-packing-sequentially.yml
+++ b/examples/llama-3/lora-1b-sample-packing-sequentially.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-3/lora-1b.yml
+++ b/examples/llama-3/lora-1b.yml
@@ -1,6 +1,7 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-3/lora-8b.yml
+++ b/examples/llama-3/lora-8b.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -4,6 +4,7 @@ base_model: meta-llama/Llama-3.2-1B
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 rl: kto
 rl_beta: 0.5
--- a/examples/llama-3/qlora-1b.yml
+++ b/examples/llama-3/qlora-1b.yml
@@ -4,6 +4,7 @@ base_model: NousResearch/Llama-3.2-1B
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/llama-3/qlora-fsdp-70b.yaml
+++ b/examples/llama-3/qlora-fsdp-70b.yaml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer  # PreTrainedTokenizerFast
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/llama-3/qlora.yml
+++ b/examples/llama-3/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: aaditya/alpaca_subset_1
--- a/examples/llama-4/maverick-qlora-fsdp1.yaml
+++ b/examples/llama-4/maverick-qlora-fsdp1.yaml
@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 plugins:
  - axolotl.integrations.liger.LigerPlugin
--- a/examples/llama-4/scout-qlora-fsdp1.yaml
+++ b/examples/llama-4/scout-qlora-fsdp1.yaml
@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
  # Automatically upload checkpoint and final model to HF
  # hub_model_id: username/custom_model_name
 strict: false
 # torch_compile: true
 plugins:
--- a/examples/llama-4/scout-qlora-single-h100.yaml
+++ b/examples/llama-4/scout-qlora-single-h100.yaml
@@ -3,6 +3,7 @@ model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 plugins:
  - axolotl.integrations.liger.LigerPlugin
--- a/examples/llama-4/scout-vision-qlora-fsdp.yaml
+++ b/examples/llama-4/scout-vision-qlora-fsdp.yaml
@@ -4,6 +4,7 @@ processor_type: Llama4Processor
  # Automatically upload checkpoint and final model to HF
  # hub_model_id: username/custom_model_name
 strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -1,5 +1,6 @@
 base_model: llava-hf/llava-1.5-7b-hf
 processor_type: AutoProcessor
 strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -5,6 +5,7 @@ tokenizer_type: AutoTokenizer
 tokenizer_config: EleutherAI/gpt-neox-20b
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/mistral/bigstral-ds-zero3.yaml
+++ b/examples/mistral/bigstral-ds-zero3.yaml
@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 unfrozen_parameters:
  - ^lm_head.weight$
--- a/examples/mistral/config.yml
+++ b/examples/mistral/config.yml
@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/mistral/lora-mps.yml
+++ b/examples/mistral/lora-mps.yml
@@ -4,6 +4,7 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/mistral/lora.yml
+++ b/examples/mistral/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/mistral/mistral-dpo-qlora.yml
+++ b/examples/mistral/mistral-dpo-qlora.yml
@@ -12,6 +12,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 chat_template: chatml
 rl: dpo
--- a/examples/mistral/mistral-qlora-fsdp.yml
+++ b/examples/mistral/mistral-qlora-fsdp.yml
@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/mistral/mistral-qlora-orpo.yml
+++ b/examples/mistral/mistral-qlora-orpo.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 rl: orpo
 orpo_alpha: 0.1
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -1,5 +1,6 @@
 base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
 processor_type: AutoProcessor
 strict: false
 load_in_8bit: true
--- a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-8x22b-qlora-fsdp.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/mistral/mixtral-qlora-fsdp.yml
+++ b/examples/mistral/mixtral-qlora-fsdp.yml
@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/mistral/mixtral.yml
+++ b/examples/mistral/mixtral.yml
@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/mistral/mixtral_22.yml
+++ b/examples/mistral/mixtral_22.yml
@@ -6,6 +6,7 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 unfrozen_parameters:
  - ^lm_head.weight$
--- a/examples/mistral/qlora.yml
+++ b/examples/mistral/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/openllama-3b/config.yml
+++ b/examples/openllama-3b/config.yml
@@ -4,6 +4,7 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/openllama-3b/lora.yml
+++ b/examples/openllama-3b/lora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/openllama-3b/qlora.yml
+++ b/examples/openllama-3b/qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: LlamaTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 push_dataset_to_hub:
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
--- a/examples/phi/lora-3.5.yaml
+++ b/examples/phi/lora-3.5.yaml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 chat_template: phi_3
 datasets:
--- a/examples/phi/phi-ft.yml
+++ b/examples/phi/phi-ft.yml
@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: garage-bAInd/Open-Platypus
--- a/examples/phi/phi-qlora.yml
+++ b/examples/phi/phi-qlora.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: garage-bAInd/Open-Platypus
--- a/examples/phi/phi2-ft.yml
+++ b/examples/phi/phi2-ft.yml
@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: garage-bAInd/Open-Platypus
--- a/examples/phi/phi3-ft-fsdp.yml
+++ b/examples/phi/phi3-ft-fsdp.yml
@@ -4,6 +4,7 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/phi/phi3-ft.yml
+++ b/examples/phi/phi3-ft.yml
@@ -7,6 +7,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 chat_template: phi_3
 strict: false
 datasets:
  - path: garage-bAInd/Open-Platypus
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -1,5 +1,6 @@
 base_model: mistral-community/pixtral-12b
 processor_type: AutoProcessor
 strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -9,6 +9,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/qwen/qwen2-moe-lora.yaml
+++ b/examples/qwen/qwen2-moe-lora.yaml
@@ -3,6 +3,7 @@ base_model: Qwen/Qwen1.5-MoE-A2.7B
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/qwen/qwen2-moe-qlora.yaml
+++ b/examples/qwen/qwen2-moe-qlora.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -1,5 +1,6 @@
 base_model: Qwen/Qwen2-VL-7B-Instruct
 processor_type: AutoProcessor
 strict: false
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
--- a/examples/qwen2/dpo.yaml
+++ b/examples/qwen2/dpo.yaml
@@ -2,6 +2,7 @@ base_model: Qwen/Qwen2.5-0.5B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 chat_template: qwen_25
 rl: dpo
--- a/examples/qwen2/prm.yaml
+++ b/examples/qwen2/prm.yaml
@@ -5,6 +5,7 @@ num_labels: 2
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 process_reward_model: true
 chat_template:
--- a/examples/qwen2/qlora-fsdp.yaml
+++ b/examples/qwen2/qlora-fsdp.yaml
@@ -6,6 +6,7 @@ trust_remote_code: true
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
--- a/examples/qwen2/reward-model.yaml
+++ b/examples/qwen2/reward-model.yaml
@@ -5,6 +5,7 @@ num_labels: 1
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 strict: false
 reward_model: true
 chat_template: qwen_25
--- a/examples/stablelm-2/1.6b/fft.yml
+++ b/examples/stablelm-2/1.6b/fft.yml
@@ -6,6 +6,7 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
--- a/Show More
+++ b/Show More