From 4f5e8a328a18522d2e46a556e9afa6e5431086cb Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 25 Dec 2025 18:09:03 +0700 Subject: [PATCH] Feat: add MiMo and Plano (#3332) [skip-ci] * feat: add xiaomi's mimo 7b * fix: pin revision * fix: update trinity docs and pin revision * fix: wrong config name * feat: add vram usage * feat: add plano * feat: update plano vram usage * chore: comments --- README.md | 2 +- examples/mimo/README.md | 39 +++++++++++ examples/mimo/mimo-7b-qlora.yaml | 67 +++++++++++++++++++ examples/plano/README.md | 42 ++++++++++++ examples/plano/plano-4b-qlora.yaml | 65 ++++++++++++++++++ examples/trinity/README.md | 4 ++ .../trinity/trinity-nano-preview-qlora.yaml | 1 + 7 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 examples/mimo/README.md create mode 100644 examples/mimo/mimo-7b-qlora.yaml create mode 100644 examples/plano/README.md create mode 100644 examples/plano/plano-4b-qlora.yaml diff --git a/README.md b/README.md index 65cd5eaa6..6b9a6f84b 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ ## 🎉 Latest Updates -- 2025/12: Axolotl now includes support for [Kimi-Linear](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/kimi-linear), [InternVL 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/internvl3_5), [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3), [Trinity](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/trinity), and [Ministral3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/ministral3). 
+- 2025/12: Axolotl now includes support for [Kimi-Linear](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/kimi-linear), [Plano-Orchestrator](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/plano), [MiMo](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mimo), [InternVL 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/internvl3_5), [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3), [Trinity](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/trinity), and [Ministral3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/ministral3). - 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/qwen3-next), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3), [Granite 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/granite4), [HunYuan](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/hunyuan), [Magistral 2509](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral#vision), [Apertus](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/apertus), and [Seed-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/seed-oss). - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion). - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107). 
diff --git a/examples/mimo/README.md b/examples/mimo/README.md new file mode 100644 index 000000000..5ae214343 --- /dev/null +++ b/examples/mimo/README.md @@ -0,0 +1,39 @@ +# Finetune Xiaomi's MiMo with Axolotl + +[MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. Pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning pattern density. + +This guide shows how to fine-tune it with Axolotl using multi-turn conversations and proper masking. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + +2. Run the finetuning example: + + ```bash + axolotl train examples/mimo/mimo-7b-qlora.yaml + ``` + +This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 + +### Tips + +- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + +## Optimization Guides + +Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). + +## Limitations + +**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future. 
+ +## Related Resources + +- [MiMo Paper](https://arxiv.org/abs/2505.07608) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/mimo/mimo-7b-qlora.yaml b/examples/mimo/mimo-7b-qlora.yaml new file mode 100644 index 000000000..689213bcd --- /dev/null +++ b/examples/mimo/mimo-7b-qlora.yaml @@ -0,0 +1,67 @@ +base_model: XiaomiMiMo/MiMo-7B-RL +trust_remote_code: true +revision_of_model: 6299b5a + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +# CCE - N/A as of now +# plugins: +# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/plano/README.md b/examples/plano/README.md new file mode 100644 index 000000000..e3f4d14dc --- /dev/null +++ b/examples/plano/README.md @@ -0,0 +1,42 @@ +# Finetune Katanemo's Plano-Orchestrator with Axolotl + 
+[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing. + +This guide shows how to fine-tune it with Axolotl using multi-turn conversations and proper masking. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + +2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. + +3. Run the finetuning example: + + ```bash + axolotl train examples/plano/plano-4b-qlora.yaml + ``` + +This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀 + +### Orchestration Prompt + +Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template. + +### Tips + +- To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply change `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed. +- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + +## Optimization Guides + +Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). 
+ +## Related Resources + +- [Plano GitHub](https://github.com/katanemo/plano) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/plano/plano-4b-qlora.yaml b/examples/plano/plano-4b-qlora.yaml new file mode 100644 index 000000000..106e44205 --- /dev/null +++ b/examples/plano/plano-4b-qlora.yaml @@ -0,0 +1,65 @@ +base_model: katanemo/Plano-Orchestrator-4B + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +chat_template: qwen3 +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/trinity/README.md b/examples/trinity/README.md index 28b2e2b52..4bbfcf29c 100644 --- a/examples/trinity/README.md +++ b/examples/trinity/README.md @@ -29,6 +29,10 @@ Let us know how it goes. Happy finetuning! 
🚀 Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). +## Limitations + +**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for Trinity in the near future. + ## Related Resources - [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto) diff --git a/examples/trinity/trinity-nano-preview-qlora.yaml b/examples/trinity/trinity-nano-preview-qlora.yaml index 43263cabd..de54fc8ac 100644 --- a/examples/trinity/trinity-nano-preview-qlora.yaml +++ b/examples/trinity/trinity-nano-preview-qlora.yaml @@ -1,5 +1,6 @@ base_model: arcee-ai/Trinity-Nano-Preview trust_remote_code: true +revision_of_model: 2ee94b0 # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name