From 243620394a2576db507b1f6ab033c4183a18233e Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Thu, 23 Oct 2025 05:23:20 +0700
Subject: [PATCH] fix: force train split for json,csv,txt for test_datasets and misc doc changes (#3226)

* fix: force train split for json,csv,txt for test_datasets

* feat(doc): add info on mixing datasets for VLM

* feat(doc): max memory

* fix(doc): clarify lr groups

* fix: add info on vision not being dropped

* feat: add qwen3-vl to multimodal docs

* fix: add moe blocks to arch list

* feat(doc): improve mistral docs

* chore: add helpful link [skip-e2e]

* fix: add vram usage for mistral small

* Update link in docs/faq.qmd

Co-authored-by: salman

---------

Co-authored-by: Wing Lian
Co-authored-by: salman
---
 docs/faq.qmd                             |  8 +++
 docs/lr_groups.qmd                       |  6 +++
 docs/multimodal.qmd                      | 14 ++++-
 examples/magistral/think/README.md       |  2 +-
 examples/magistral/vision/README.md      |  2 +-
 examples/mistral/mistral-small/README.md | 51 +++++++++++++++++++
 .../mistral-small-3.1-24B-lora.yml       |  2 +-
 src/axolotl/common/architectures.py      |  2 +
 src/axolotl/utils/data/shared.py         |  5 ++
 9 files changed, 88 insertions(+), 4 deletions(-)
 create mode 100644 examples/mistral/mistral-small/README.md

diff --git a/docs/faq.qmd b/docs/faq.qmd
index ffc29d35d..92b432f2d 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -63,6 +63,14 @@ description: Frequently asked questions
 > A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.

+**Q: Can we mix text and text+image datasets for VLM training?**
+
+> A: Yes, you can for newer VLM architectures. The ones that do not support this are the LLaVA / Pixtral architectures. If you notice another architecture not working, please let us know!
+
+**Q: Why is `memory/max_*` different from `nvidia-smi`?**
+
+> A: We use `torch` APIs to retrieve this information. See https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.
+
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**

diff --git a/docs/lr_groups.qmd b/docs/lr_groups.qmd
index 52059016c..ce5350722 100644
--- a/docs/lr_groups.qmd
+++ b/docs/lr_groups.qmd
@@ -27,3 +27,9 @@ learning_rate: 2e-5
 In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning rate of 1e-5 for the 3rd layer's self attention `q_proj` module.
+
+::: {.callout-note}
+
+We currently only support varying `lr`. If you're interested in adding support for other hyperparameters (e.g. `weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17
+
+:::

diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index 3a28b579a..1c4e28ea7 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -56,10 +56,14 @@ image_resize_algorithm: bilinear
 Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.

-::: {.callout-warning}
+::: {.callout-tip}
 Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
 :::

+::: {.callout-note}
+As of now, we neither truncate nor drop samples based on `sequence_len`, as each architecture processes non-text tokens differently. We are looking for help on this.
+:::
+
 ### Mllama {#sec-mllama}

 ```yaml
@@ -168,6 +172,14 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct

 chat_template: qwen2_vl # same as qwen2-vl
 ```

+### Qwen3-VL {#sec-qwen3-vl}
+
+```yaml
+base_model: Qwen/Qwen3-VL-4B-Instruct
+
+chat_template: qwen2_vl # same as qwen2-vl
+```
+
 ### SmolVLM2 {#sec-smolvlm2}

 ::: {.callout-tip}

diff --git a/examples/magistral/think/README.md b/examples/magistral/think/README.md
index 29950f59e..a87579775 100644
--- a/examples/magistral/think/README.md
+++ b/examples/magistral/think/README.md
@@ -12,7 +12,7 @@ Before starting, ensure you have:
 Run the thinking model fine-tuning:

 ```bash
-axolotl train magistral-small-think-qlora.yaml
+axolotl train examples/magistral/think/magistral-small-think-qlora.yaml
 ```

 This config uses about 19.1 GiB VRAM.

diff --git a/examples/magistral/vision/README.md b/examples/magistral/vision/README.md
index 932a3631e..fc614c850 100644
--- a/examples/magistral/vision/README.md
+++ b/examples/magistral/vision/README.md
@@ -21,7 +21,7 @@ Before starting, ensure you have:
 3. Run the fine-tuning:

    ```bash
-   axolotl train magistral-small-vision-24B-qlora.yml
+   axolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml
    ```

 This config uses about 17GiB VRAM.

diff --git a/examples/mistral/mistral-small/README.md b/examples/mistral/mistral-small/README.md
new file mode 100644
index 000000000..3c606a897
--- /dev/null
+++ b/examples/mistral/mistral-small/README.md
@@ -0,0 +1,51 @@
+# Mistral Small 3.1/3.2 Fine-tuning
+
+This guide covers fine-tuning [Mistral Small 3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503) and [Mistral Small 3.2](https://huggingface.co/mistralai/Mistral-Small-3.2-24B-Instruct-2506) with vision capabilities using Axolotl.
+
+## Prerequisites
+
+Before starting, ensure you have:
+- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))
+
+## Getting Started
+
+1. Install the required vision lib:
+   ```bash
+   pip install 'mistral-common[opencv]==1.8.5'
+   ```
+
+2. Download the example dataset image:
+   ```bash
+   wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
+   ```
+
+3. Run the fine-tuning:
+   ```bash
+   axolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
+   ```
+
+This config uses about 29.4 GiB VRAM.
+
+## Dataset Format
+
+The vision model requires the multi-modal dataset format documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
+
+One exception: passing `"image": PIL.Image` is not supported, as `MistralTokenizer` only supports `path`, `url`, and `base64` for now.
+
+Example:
+```json
+{
+    "messages": [
+        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
+        {"role": "user", "content": [
+            { "type": "text", "text": "What's in this image?"},
+            {"type": "image", "path": "path/to/image.jpg" }
+        ]},
+        {"role": "assistant", "content": [{ "type": "text", "text": "..." }]}
+    ]
+}
+```
+
+## Limitations
+
+- Sample packing is currently not supported for multi-modal training.
diff --git a/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
index ec197f333..d45d13ac6 100644
--- a/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
@@ -39,7 +39,7 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 1
+micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine

diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py
index b754e56ba..c8a2f0836 100644
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -12,7 +12,9 @@ MOE_ARCH_BLOCK = {
     "mixtral": "MixtralSparseMoeBlock",
     "qwen2_moe": "Qwen2MoeSparseMoeBlock",
     "qwen3_moe": "Qwen3MoeSparseMoeBlock",
+    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
     "deepseek_v2": "DeepseekV2MoE",
+    "deepseek_v3": "DeepseekV3MoE",
     "gpt_oss": "GptOssDecoderLayer",
     "lfm2_moe": "Lfm2MoeSparseMoeBlock",
 }

diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index c9a91b829..a8ed55ae2 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -239,6 +239,11 @@ def _load_from_local_path(
         return load_dataset(dataset_config.path, **load_dataset_kwargs)
     elif local_path.is_file():
         dataset_type = get_dataset_type(dataset_config)
+
+        # For single file datasets, HF always creates only a "train" split
+        if dataset_type in ("json", "csv", "text"):
+            load_dataset_kwargs["split"] = "train"
+
         return load_dataset(
             dataset_type,
             data_files=dataset_config.path,
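
A quick illustration of the `datasets` behavior behind the `shared.py` change above (not part of the patch): when `load_dataset` is pointed at a single json/csv/text file, Hugging Face produces a `DatasetDict` containing exactly one split named `"train"`, even if the file is intended for evaluation, so forcing `split="train"` simply unwraps that split. The sketch below assumes a hypothetical local file `data.jsonl`.

```python
# Illustrative sketch only -- not part of the patch above.
# Assumes a hypothetical local file data.jsonl with one JSON object per line.
from datasets import load_dataset

# Without an explicit `split`, a single data file is wrapped in a DatasetDict
# whose only split is always named "train", even for test/eval data.
ds_dict = load_dataset("json", data_files="data.jsonl")
print(list(ds_dict.keys()))  # ['train']

# Forcing split="train" (what the patch does for json/csv/text files)
# unwraps that single split and returns a plain Dataset.
ds = load_dataset("json", data_files="data.jsonl", split="train")
print(ds.num_rows)
```

This is why a `test_datasets` entry that points at a single local file still has to request the `train` split internally, even though the data is used for evaluation.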