diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index 1c4e28ea7..e63a553b2 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -124,6 +124,8 @@ Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral
 ```yaml
 base_model: mistralai/Voxtral-Mini-3B-2507
+
+processor_type: VoxtralProcessor
 ```
 
 ### Gemma-3 {#sec-gemma-3}
 
diff --git a/examples/voxtral/voxtral-mini-audio-qlora.yml b/examples/voxtral/voxtral-mini-audio-qlora.yml
index 8fe6adbff..59150c4ca 100644
--- a/examples/voxtral/voxtral-mini-audio-qlora.yml
+++ b/examples/voxtral/voxtral-mini-audio-qlora.yml
@@ -1,5 +1,5 @@
 base_model: mistralai/Voxtral-Mini-3B-2507
-processor_type: AutoProcessor
+processor_type: VoxtralProcessor
 
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
diff --git a/src/axolotl/loaders/processor.py b/src/axolotl/loaders/processor.py
index 7580b2008..b35ea00fd 100644
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -1,7 +1,5 @@
 """Processor loading functionality for multi-modal models"""
 
-from typing import Any
-
 import transformers
 from transformers import (
     AutoProcessor,
@@ -15,13 +13,33 @@ LOG = get_logger(__name__)
 
 
 def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
-    processor_kwargs: dict[str, Any] = {}  # Do we actually need this?
-
     processor_cls = AutoProcessor
     if cfg.processor_type:
         processor_cls = getattr(transformers, cfg.processor_type)
 
     if cfg.tokenizer_use_mistral_common:
+
+        def _patch_mistralcommontokenizer():
+            """
+            Transformers v5 stops reading the sub-processor, so patch
+            MistralCommonTokenizer in place so that both processors
+            resolve to HFMistralTokenizer.
+            """
+            import transformers.tokenization_mistral_common as tokenization_mistral_common
+
+            from axolotl.utils.mistral import HFMistralTokenizer
+
+            tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer
+
+        _patch_mistralcommontokenizer()
+
+        from transformers import VoxtralProcessor
+
+        if processor_cls == VoxtralProcessor:
+            return VoxtralProcessor.from_pretrained(
+                cfg.processor_config,
+            )
+
         from axolotl.utils.mistral import Mistral3Processor
 
         return Mistral3Processor(
@@ -32,7 +50,6 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
         cfg.processor_config,
         trust_remote_code=cfg.trust_remote_code or False,
         tokenizer=tokenizer,
-        **processor_kwargs,
     )
 
     # Attempt to load image size from processor if available
diff --git a/src/axolotl/utils/mistral/mistral3_processor.py b/src/axolotl/utils/mistral/mistral3_processor.py
index 85479ca7b..01e8f9f10 100644
--- a/src/axolotl/utils/mistral/mistral3_processor.py
+++ b/src/axolotl/utils/mistral/mistral3_processor.py
@@ -30,6 +30,7 @@ class Mistral3Processor(ProcessorMixin):
     Wraps HFMistralTokenizer and adds image processing capabilities.
     """
 
+    # TODO(nano): This should be removed in transformers v5
     attributes = ["tokenizer"]
     tokenizer_class = "HFMistralTokenizer"
 
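For reviewers, a minimal usage sketch (not part of this diff) of the new `VoxtralProcessor` branch in `load_processor`. The `DictDefault` import path and the literal config values are assumptions for illustration; the function, config keys, and dispatch behavior come from the diff above:

```python
# Hypothetical sketch: exercising the new VoxtralProcessor branch of
# load_processor. Assumes DictDefault is importable from axolotl.utils.dict
# (matching the type annotation in processor.py).
from axolotl.loaders.processor import load_processor
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "processor_type": "VoxtralProcessor",
        "processor_config": "mistralai/Voxtral-Mini-3B-2507",
        "tokenizer_use_mistral_common": True,
    }
)

# On this branch, load_processor first patches
# transformers.tokenization_mistral_common.MistralCommonTokenizer to
# HFMistralTokenizer, then returns
# VoxtralProcessor.from_pretrained(cfg.processor_config) directly;
# the tokenizer argument is unused on the early-return path.
processor = load_processor(cfg, tokenizer=None)
```

Returning early for `VoxtralProcessor` keeps the `AutoProcessor` fallback (and its `tokenizer=` wiring) untouched for other multi-modal models.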