From ebbd7fa8473c2412cfd78b4f6cc0233e5d907970 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 29 Apr 2026 22:46:51 +0700 Subject: [PATCH] feat: Add Mistral Medium 3.5 (#3633) * fix: clarify incompat * fix: transformers api change upstream * fix: add pre prop * feat: add examples * chore: cleanup * chore: update readme --- README.md | 3 + docs/attention.qmd | 4 +- docs/scripts/examples-allowlist.yml | 2 + examples/devstral/devstral-small-qlora.yml | 3 +- examples/ministral3/ministral3-3b-qlora.yaml | 2 +- examples/mistral-medium-3_5/README.md | 78 ++++++++++++++++++++ examples/mistral-medium-3_5/qlora-text.yml | 56 ++++++++++++++ examples/mistral-medium-3_5/qlora-vision.yml | 61 +++++++++++++++ src/axolotl/utils/collators/mm_chat.py | 8 +- 9 files changed, 210 insertions(+), 7 deletions(-) create mode 100644 examples/mistral-medium-3_5/README.md create mode 100644 examples/mistral-medium-3_5/qlora-text.yml create mode 100644 examples/mistral-medium-3_5/qlora-vision.yml diff --git a/README.md b/README.md index 73e73d6ae..7e8d3bf48 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,9 @@ ## 🎉 Latest Updates +- 2026/04: + - New model support has been added in Axolotl for [Mistral Medium 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral-medium-3_5) and [Gemma 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/gemma4). + - Axolotl is now [uv-first](https://github.com/axolotl-ai-cloud/axolotl/pull/3545) and has [SonicMoE fused LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3519) support. 
- 2026/03: - New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45). - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat). diff --git a/docs/attention.qmd b/docs/attention.qmd index 771299a29..b9644e074 100644 --- a/docs/attention.qmd +++ b/docs/attention.qmd @@ -54,8 +54,10 @@ python setup.py install Requirements: Hopper or Blackwell GPUs +FA4 is still a pre-release on PyPI, so `--pre` is required: + ```bash -pip install flash-attn-4 +pip install --pre flash-attn-4 ``` Or from source: diff --git a/docs/scripts/examples-allowlist.yml b/docs/scripts/examples-allowlist.yml index 50acaea8e..f2fe88e08 100644 --- a/docs/scripts/examples-allowlist.yml +++ b/docs/scripts/examples-allowlist.yml @@ -20,6 +20,8 @@ examples: title: Arcee AFM # MistralAI + - name: mistral-medium-3_5 + title: Mistral Medium 3.5 - name: ministral3/think title: Ministral 3 Thinking - name: ministral3/vision diff --git a/examples/devstral/devstral-small-qlora.yml b/examples/devstral/devstral-small-qlora.yml index ca8e8e043..3eafb9219 100644 --- a/examples/devstral/devstral-small-qlora.yml +++ b/examples/devstral/devstral-small-qlora.yml @@ -26,7 +26,6 @@ lora_model_dir: sequence_len: 2048 sample_packing: true - lora_r: 32 lora_alpha: 16 lora_dropout: 0 @@ -52,7 +51,7 @@ gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true -scaling_softmax: true +# scaling_softmax: true # needs flex_attention 
loss_watchdog_threshold: 5.0 loss_watchdog_patience: 3 diff --git a/examples/ministral3/ministral3-3b-qlora.yaml b/examples/ministral3/ministral3-3b-qlora.yaml index b369c9d41..4efe5bd2f 100644 --- a/examples/ministral3/ministral3-3b-qlora.yaml +++ b/examples/ministral3/ministral3-3b-qlora.yaml @@ -59,7 +59,7 @@ gradient_checkpointing: true resume_from_checkpoint: logging_steps: 1 flash_attention: true -scaling_softmax: true +# scaling_softmax: true # needs flex_attention warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/mistral-medium-3_5/README.md b/examples/mistral-medium-3_5/README.md new file mode 100644 index 000000000..f90397a03 --- /dev/null +++ b/examples/mistral-medium-3_5/README.md @@ -0,0 +1,78 @@ +# Finetune Mistral Medium 3.5 with Axolotl + +[Mistral Medium 3.5](https://huggingface.co/mistralai/Mistral-Medium-3.5-128B) is a 128B parameter dense multimodal model from MistralAI that unifies instruct, reasoning, and agentic capabilities into a single model. +It shares the `mistral3` architecture (dense, YaRN RoPE, 256k context) with Ministral 3 and supports the same `reasoning_effort` toggle as Mistral Small 4. + +Thanks to the team at MistralAI for giving us early access to prepare for this release. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + +2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. + +3. (Text config only) Install [Flash Attention 4](https://docs.axolotl.ai/docs/attention.html#flash-attention-4) on Hopper/Blackwell. + +4. 
Run one of the example configs: + + ```bash + # text-only + axolotl train examples/mistral-medium-3_5/qlora-text.yml # ~83.1 GiB + + # text + vision + # wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg + axolotl train examples/mistral-medium-3_5/qlora-vision.yml # ~80.3 GiB + ``` + +Note: vision training does not currently work with Flash Attention 4. + +## Reasoning Effort + +The chat template supports a `reasoning_effort` variable to control the model's reasoning depth: + +- `"none"` — instruct mode (default) +- `"high"` — reasoning mode with explicit thinking steps + +Pass it via `chat_template_kwargs` under your dataset config: + +```yaml +datasets: + - path: your/dataset + type: chat_template + chat_template_kwargs: + reasoning_effort: high +``` + +## Thinking Support + +The chat template supports a `thinking` content type in assistant messages for training on reasoning traces (rendered as `[THINK]...[/THINK]` blocks). + +To use thinking datasets, add the `thinking` mapping via `message_property_mappings`: + +```yaml +datasets: + - path: your/thinking-dataset + type: chat_template + message_property_mappings: + role: role + content: content + thinking: thinking + chat_template_kwargs: + reasoning_effort: high +``` + +See the [Magistral thinking guide](../magistral/think/README.md) for dataset format details. + +## Tips + +- For smaller experiments on the same architecture, see [`examples/ministral3`](../ministral3/README.md) (Ministral 3, 3B). +- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). +- The vision model requires the multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
+ +## Related Resources + +- [Mistral Medium 3.5 Blog](https://mistral.ai/news/vibe-remote-agents-mistral-medium-3-5) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/mistral-medium-3_5/qlora-text.yml b/examples/mistral-medium-3_5/qlora-text.yml new file mode 100644 index 000000000..2ff5b1af3 --- /dev/null +++ b/examples/mistral-medium-3_5/qlora-text.yml @@ -0,0 +1,56 @@ +base_model: axolotl-ai-co/Mistral-Medium-3.5-128B-BF16 + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0 +# prevents targeting vision layers +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +lora_mlp_kernel: true +lora_qkv_kernel: true +lora_o_kernel: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +max_steps: 10 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 diff --git a/examples/mistral-medium-3_5/qlora-vision.yml b/examples/mistral-medium-3_5/qlora-vision.yml new file mode 100644 index 000000000..a8a51116b --- /dev/null +++ b/examples/mistral-medium-3_5/qlora-vision.yml @@ -0,0 +1,61 @@ +base_model: axolotl-ai-co/Mistral-Medium-3.5-128B-BF16 +processor_type: AutoProcessor + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_4bit: true + 
+skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# sample dataset below requires downloading image in advance +# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg +datasets: + - path: Nanobit/text-vision-2k-test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +lora_mlp_kernel: true +lora_qkv_kernel: true +lora_o_kernel: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +max_steps: 10 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 diff --git a/src/axolotl/utils/collators/mm_chat.py b/src/axolotl/utils/collators/mm_chat.py index 542918527..b81612cbc 100644 --- a/src/axolotl/utils/collators/mm_chat.py +++ b/src/axolotl/utils/collators/mm_chat.py @@ -47,10 +47,12 @@ class MultiModalChatDataCollator(DataCollatorMixin): messages, add_generation_prompt=False, tokenize=True, - return_tensors="pt", - padding=True, - return_dict=True, chat_template=self.processing_strategy.chat_template, + processor_kwargs={ + "return_tensors": "pt", + "padding": True, + "return_dict": True, + }, ) # Process the labels