diff --git a/README.md b/README.md
index 13a5a9243..65cd5eaa6 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@
 
 ## 🎉 Latest Updates
 
-- 2025/12: Axolotl now includes support for [Kimi-Linear](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/kimi-linear), [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3), [Trinity](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/trinity), and [Ministral3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/ministral3).
+- 2025/12: Axolotl now includes support for [Kimi-Linear](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/kimi-linear), [InternVL 3.5](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/internvl3_5), [Olmo3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/olmo3), [Trinity](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/trinity), and [Ministral3](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/ministral3).
 - 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/qwen3-next), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3), [Granite 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/granite4), [HunYuan](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/hunyuan), [Magistral 2509](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral#vision), [Apertus](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/apertus), and [Seed-OSS](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/seed-oss).
 - 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
 - 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index e63a553b2..71d5cc766 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -21,6 +21,7 @@ format:
 - [Qwen2.5-VL](#sec-qwen25-vl)
 - [SmolVLM2](#sec-smolvlm2)
 - [LFM2-VL](#sec-lfm2-vl)
+- [InternVL](#sec-intern-vl)
 
 ## Usage
 
@@ -202,6 +203,16 @@ Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
 base_model: LiquidAI/LFM2-VL-450M
 ```
 
+### InternVL {#sec-intern-vl}
+
+::: {.callout-tip}
+Please make sure to install `timm` via `pip3 install timm==1.0.19`
+:::
+
+```yaml
+base_model: OpenGVLab/InternVL3_5-8B-HF
+```
+
 ## Dataset Format
 
 For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb
index 133c3db79..be4f3f4a9 100644
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@242b245\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2\""
   ]
  },
 {
diff --git a/examples/internvl3_5/README.md b/examples/internvl3_5/README.md
new file mode 100644
index 000000000..d2584bb80
--- /dev/null
+++ b/examples/internvl3_5/README.md
@@ -0,0 +1,56 @@
+# Finetune OpenGVLab's InternVL with Axolotl
+
+[InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models from OpenGVLab that supports dynamic resolution and multi-image understanding. It pairs a ViT-style vision encoder with a strong language model backbone for tasks like visual question answering, OCR, and scene text understanding.
+
+This guide shows how to fine-tune it with Axolotl.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
+
+2. Install `timm` for vision model support:
+
+    ```bash
+    pip install timm==1.0.19
+    ```
+
+3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
+
+4. Run the finetuning example:
+
+    ```bash
+    axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml
+    ```
+
+This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀
+
+### Tips
+
+- You can run a full finetune by removing `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on loading your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html); a minimal local-file sketch is shown below.
+- The dataset format follows the multi-modal format described [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
+
+## Optimization Guides
+
+Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
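+
+## Using Your Own Data
+
+As a minimal sketch (the path and filename below are placeholders for your own data), training on a local multi-modal JSONL file only requires swapping the `datasets` entry in the config:
+
+```yaml
+datasets:
+  - path: ./data/my_vision_dataset.jsonl
+    type: chat_template
+    field_messages: messages
+```
+
+Each row should follow the multi-modal `chat_template` format linked in the Tips above.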
+
+## Related Resources
+
+- [InternVL Paper](https://huggingface.co/papers/2508.18265)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
diff --git a/examples/internvl3_5/internvl3_5-8b-qlora.yml b/examples/internvl3_5/internvl3_5-8b-qlora.yml
new file mode 100644
index 000000000..9a72d078a
--- /dev/null
+++ b/examples/internvl3_5/internvl3_5-8b-qlora.yml
@@ -0,0 +1,61 @@
+base_model: OpenGVLab/InternVL3_5-8B-HF
+processor_type: AutoProcessor
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+load_in_4bit: true
+
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+    field_messages: messages
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./outputs/out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 2048
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: true
+fp16:
+tf32: true
+
+gradient_checkpointing: true
+logging_steps: 1
+flash_attention: true
+eager_attention:
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py
index e902bb0ac..59e55c074 100644
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 
 print(
     UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@242b245"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"'
 )
diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md
index b28382542..771f446b0 100644
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip
 
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@242b245"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"
 ```
 
 ## Usage
@@ -54,6 +54,7 @@ plugins:
 - granitemoehybrid
 - hunyuan_v1_dense
 - hunyuan_v1_moe
+- internvl
 - kimi_linear
 - lfm2
 - lfm2_moe
diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py
index 0d1588f99..3c059da4c 100644
--- a/src/axolotl/integrations/cut_cross_entropy/__init__.py
+++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py
@@ -35,7 +35,7 @@
 
 LOG = get_logger(__name__)
 
 _CCE_INSTALL_MESSAGE = (
( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@242b245"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"`' ) diff --git a/src/axolotl/loaders/utils.py b/src/axolotl/loaders/utils.py index 240e00da7..b1902c9b5 100644 --- a/src/axolotl/loaders/utils.py +++ b/src/axolotl/loaders/utils.py @@ -79,7 +79,11 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig): and hasattr(model_config, "vision_config") and hasattr(model_config.vision_config, "image_size") ): - cfg.image_size = model_config.vision_config.image_size + image_size = model_config.vision_config.image_size + if isinstance(image_size, list): + cfg.image_size = tuple(image_size) + else: + cfg.image_size = image_size LOG.debug(f"Loaded image size: {cfg.image_size} from model config") quant_config_exists = ( diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 07b114163..c209c892a 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -8,6 +8,7 @@ from PIL.Image import Resampling from torch import Tensor, zeros_like from transformers import ProcessorMixin from transformers.image_utils import load_image +from transformers.models.internvl import InternVLProcessor from transformers.models.smolvlm import SmolVLMProcessor from transformers.models.voxtral import VoxtralProcessor @@ -454,6 +455,37 @@ class Mistral3ProcessingStrategy(ProcessingStrategy): return labels +class InternVLProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for InternVL""" + + def __init__( + self, + processor: ProcessorMixin, + chat_template: Optional[str] = None, + image_size: int | tuple[int, int] | None = None, + image_resize_algorithm: Resampling | None = None, + ): + super().__init__(processor, chat_template, image_size, image_resize_algorithm) + + if not hasattr(processor, "image_ids"): + raise ValueError("'image_ids' missing from InternVL Processor.") + + self.image_token_ids = processor.image_ids + + def process_labels(self, input_ids): + labels = input_ids.clone() + + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + + for ids in self.image_token_ids: + labels[labels == ids] = -100 + + # Note: Check if need to mask 'video_token' as it gets converted to + # image patches during media processing + + return labels + + def get_processing_strategy( processor: ProcessorMixin, chat_template, @@ -501,6 +533,11 @@ def get_processing_strategy( **processing_kwargs, ) + if isinstance(processor, InternVLProcessor): + return InternVLProcessingStrategy( + **processing_kwargs, + ) + # llama3_2_vision, llama4, llava # mistral_v7_tekken, pixtral, lfm2vl return ProcessingStrategy(