feat: add llama4 multimodal (#2499)

* feat: add llama4 multimodal
* feat: add torchvision to base docker
* just use latest torchvision

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
2025-04-07 21:49:29 +07:00
parent 8bbad21bfd
commit e0e5d9b1d6
6 changed files with 15 additions and 1 deletions
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace

 RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -9,6 +9,7 @@ format:
 ## Supported Models

 - [Mllama](#sec-mllama)
+- [Llama4](#sec-llama4)
 - [Pixtral](#sec-pixtral)
 - [Llava-1.5](#sec-llava-15)
 - [Mistral-Small-3.1](#sec-mistral-small-31)
@@ -63,6 +64,14 @@ base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
 chat_template: llama3_2_vision
 ```

+### Llama4 {#sec-llama4}
+
+```yaml
+base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
+
+chat_template: llama4
+```
+
 ### Pixtral {#sec-pixtral}

 ```yaml
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -268,6 +268,7 @@ def get_processing_strategy(
        )
    if chat_template_type in [
        "llama3_2_vision",
+        "llama4",
        "llava",
        "mistral_v7_tekken",
        "pixtral",
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -36,6 +36,7 @@ from transformers import (
    BitsAndBytesConfig,
    Gemma3ForConditionalGeneration,
    GPTQConfig,
+    Llama4ForConditionalGeneration,
    LlavaForConditionalGeneration,
    Mistral3ForConditionalGeneration,
    MllamaForConditionalGeneration,
@@ -76,6 +77,7 @@ LOG = logging.getLogger(__name__)

 MULTIMODAL_AUTO_MODEL_MAPPING = {
    "mllama": MllamaForConditionalGeneration,
+    "llama4": Llama4ForConditionalGeneration,
    "llava": LlavaForConditionalGeneration,
    "qwen2_vl": Qwen2VLForConditionalGeneration,
    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -28,6 +28,7 @@ class ChatTemplate(str, Enum):
    llama3 = "llama3"  # pylint: disable=invalid-name
    llama4 = "llama4"  # pylint: disable=invalid-name
    llama3_2_vision = "llama3_2_vision"  # pylint: disable=invalid-name
+    llama4 = "llama4"  # pylint: disable=invalid-name
    phi_3 = "phi_3"  # pylint: disable=invalid-name
    phi_35 = "phi_35"  # pylint: disable=invalid-name
    deepseek_v2 = "deepseek_v2"  # pylint: disable=invalid-name