diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd index 71d5cc766..54793c6e3 100644 --- a/docs/multimodal.qmd +++ b/docs/multimodal.qmd @@ -19,6 +19,7 @@ format: - [Gemma-3n](#sec-gemma-3n) - [Qwen2-VL](#sec-qwen2-vl) - [Qwen2.5-VL](#sec-qwen25-vl) +- [GLM-4.6V](#sec-glm-4-6v) - [SmolVLM2](#sec-smolvlm2) - [LFM2-VL](#sec-lfm2-vl) - [Intern-VL](#sec-intern-vl) @@ -183,6 +184,18 @@ base_model: Qwen/Qwen3-VL-4B-Instruct chat_template: qwen2_vl # same as qwen2-vl ``` +### GLM-4.6V {#sec-glm-4-6v} + +Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported. + +```yaml +# GLM-4.6V (106B MoE version) +base_model: zai-org/GLM-4.6V + +# OR GLM-4.6V-Flash (9B version) +base_model: zai-org/GLM-4.6V-Flash +``` + ### SmolVLM2 {#sec-smolvlm2} ::: {.callout-tip} diff --git a/examples/glm46v/README.md b/examples/glm46v/README.md new file mode 100644 index 000000000..965e08e51 --- /dev/null +++ b/examples/glm46v/README.md @@ -0,0 +1,44 @@ +# Finetune GLM-4.6V with Axolotl + +GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). This guide shows how to fine-tune it with Axolotl for vision-language tasks. + + + +## Getting started + +1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build). + +2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage. + + +3. Run the fine-tuning: + + GLM-4.6V-Flash (9B) + ```bash + axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml + ``` + +Let us know how it goes. Happy finetuning! 🚀 + +## Tips + +- Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) +- You can run a **full finetuning** by removing `adapter: qlora` and `load_in_4bit: true` from the config. 
+- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html). + +## Supported Models + +- **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`) +- **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`) + +## Optimization Guides + +Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html). + +## Related Resources + +- [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/glm46v/glm-4-6v-flash-ddp.yaml b/examples/glm46v/glm-4-6v-flash-ddp.yaml new file mode 100644 index 000000000..c67ac5e28 --- /dev/null +++ b/examples/glm46v/glm-4-6v-flash-ddp.yaml @@ -0,0 +1,53 @@ +base_model: zai-org/GLM-4.6V-Flash +trust_remote_code: true + +processor_type: AutoProcessor +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates with images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false +ddp_find_unused_parameters: true + +output_dir: ./outputs/glm-4-6v-flash-qlora +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] + +adapter: qlora +lora_r: 16 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +sequence_len: 2048 + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +logging_steps: 1 +sdp_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 0 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/examples/glm46v/glm-4-6v-flash-qlora.yaml 
b/examples/glm46v/glm-4-6v-flash-qlora.yaml new file mode 100644 index 000000000..287944ae8 --- /dev/null +++ b/examples/glm46v/glm-4-6v-flash-qlora.yaml @@ -0,0 +1,50 @@ +base_model: zai-org/GLM-4.6V-Flash +trust_remote_code: true + +processor_type: AutoProcessor +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +output_dir: ./outputs/glm-4-6v-flash-qlora +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] + +adapter: qlora +lora_r: 16 +lora_alpha: 32 +lora_dropout: 0.05 +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +sequence_len: 2048 + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +logging_steps: 1 +sdp_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 0 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 077db4388..c8b153e6d 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -485,6 +485,58 @@ class InternVLProcessingStrategy(ProcessingStrategy): return labels +class Glm4vProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for GLM4V and GLM4V-MoE vision models.""" + + def __init__( + self, + processor: ProcessorMixin, + chat_template: Optional[str] = None, + image_size: int | tuple[int, int] | None = None, + image_resize_algorithm: Resampling | None = None, + ): + super().__init__(processor, chat_template, image_size, image_resize_algorithm) + + self.tokenizer = getattr(processor, "tokenizer", processor) + + self.image_token = "<|image|>" # nosec + self.begin_image_token = "<|begin_of_image|>" # nosec + self.end_image_token = 
"<|end_of_image|>" # nosec + self.video_token = "<|video|>" # nosec + self.begin_video_token = "<|begin_of_video|>" # nosec + self.end_video_token = "<|end_of_video|>" # nosec + + self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token) + self.begin_image_token_id = self.tokenizer.convert_tokens_to_ids( + self.begin_image_token + ) + self.end_image_token_id = self.tokenizer.convert_tokens_to_ids( + self.end_image_token + ) + self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token) + self.begin_video_token_id = self.tokenizer.convert_tokens_to_ids( + self.begin_video_token + ) + self.end_video_token_id = self.tokenizer.convert_tokens_to_ids( + self.end_video_token + ) + + def process_labels(self, input_ids): + labels = input_ids.clone() + + labels[labels == self.tokenizer.pad_token_id] = -100 + + labels[labels == self.image_token_id] = -100 + labels[labels == self.begin_image_token_id] = -100 + labels[labels == self.end_image_token_id] = -100 + + labels[labels == self.video_token_id] = -100 + labels[labels == self.begin_video_token_id] = -100 + labels[labels == self.end_video_token_id] = -100 + + return labels + + def get_processing_strategy( processor: ProcessorMixin, chat_template, @@ -501,10 +553,10 @@ def get_processing_strategy( "image_resize_algorithm": image_resize_algorithm, } - if chat_template_type in [None, "tokenizer_default"] and hasattr( - processor.tokenizer, "chat_template" - ): - processing_kwargs["chat_template"] = processor.tokenizer.chat_template + if chat_template_type in [None, "tokenizer_default"]: + tokenizer = getattr(processor, "tokenizer", processor) + if hasattr(tokenizer, "chat_template"): + processing_kwargs["chat_template"] = tokenizer.chat_template if chat_template_type == "qwen2_vl": return Qwen2VLProcessingStrategy( @@ -533,6 +585,15 @@ def get_processing_strategy( return Mistral3ProcessingStrategy( **processing_kwargs, ) + try: + from transformers.models.glm46v.processing_glm46v import 
Glm46VProcessor + + if isinstance(processor, Glm46VProcessor): + return Glm4vProcessingStrategy( + **processing_kwargs, + ) + except ImportError: + pass if isinstance(processor, InternVLProcessor): return InternVLProcessingStrategy(