fix: missing key in enum

feat: add custom processing strategy for phi35 vl
feat: add phi_35_vl support
2025-07-03 13:46:16 +08:00 · 2025-07-03 13:46:16 +08:00 · 2025-07-03 13:46:16 +08:00
15 changed files with 156 additions and 46 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,13 +5,11 @@ on:
    branches:
      - "main"
    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
+      - 'Dockerfile-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
+      - 'Dockerfile-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,9 +18,96 @@ jobs:
        env:
          SKIP: no-commit-to-branch

+  preload-cache:
+    name: Preload HF cache
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
@@ -33,11 +120,14 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -78,6 +168,10 @@ jobs:
        run: |
          axolotl --help

+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
      - name: Run tests
        run: |
          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -99,8 +193,15 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -22,11 +22,9 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
-    && conda init bash \
-    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

@@ -39,7 +37,3 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
-
-RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
-        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
-    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -22,11 +22,9 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
-    && conda init bash \
-    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -22,11 +22,9 @@ RUN apt-get update \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "axolotl-py${PYTHON_VERSION}" python="${PYTHON_VERSION}" \
-    && conda init bash \
-    && echo "conda activate axolotl-py${PYTHON_VERSION}" >> ~/.bashrc
+    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/root/miniconda3/envs/axolotl-py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,10 +51,6 @@ description: Frequently asked questions
 >   pad_token: "..."
 > ```

-**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
-
-> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -16,6 +16,7 @@ format:
 - [Gemma-3](#sec-gemma-3)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
+- [Phi3-V](#sec-phi3-v)

 ## Usage

@@ -126,6 +127,15 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
 chat_template: qwen2_vl  # same as qwen2-vl
 ```

+### Phi3-V {#sec-phi3-v}
+
+```yaml
+base_model: microsoft/Phi-3.5-vision-instruct
+
+trust_remote_code: true
+chat_template: phi_35_vl
+```
+
 ## Dataset Format

 For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,12 +35,6 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    check_accelerate_default_config()
    check_user_token()

-    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
-        if cfg.get("key"):
-            raise ValueError(
-                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
-            )
-
    if not cfg.dataset_prepared_path:
        msg = (
            Fore.RED
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -219,9 +219,7 @@ class TrainerBuilderBase(abc.ABC):
        if self.cfg.bf16 == "full":
            training_args_kwargs["bf16_full_eval"] = True
        else:
-            bf16 = self.cfg.bf16 or self.cfg.bfloat16
-            bf16 = bf16 if bf16 is not None else False
-            training_args_kwargs["bf16"] = bf16
+            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16

    def _configure_scheduler(self, training_args_kwargs: dict):
        if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
--- a/src/axolotl/loaders/constants.py
+++ b/src/axolotl/loaders/constants.py
@@ -1,6 +1,7 @@
 """Shared constants for axolotl.loaders module"""

 from transformers import (
+    AutoModelForCausalLM,
    Gemma3ForConditionalGeneration,
    Llama4ForConditionalGeneration,
    LlavaForConditionalGeneration,
@@ -18,4 +19,6 @@ MULTIMODAL_AUTO_MODEL_MAPPING = {
    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
    "mistral3": Mistral3ForConditionalGeneration,
    "gemma3": Gemma3ForConditionalGeneration,
+    # phi3_v modeling code is not available in transformers yet
+    "phi3_v": AutoModelForCausalLM,
 }
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -264,6 +264,23 @@ class Gemma3ProcessingStrategy(ProcessingStrategy):
        return labels


+class Phi35VLProcessingStrategy(ProcessingStrategy):
+    """Processing Strategy class for Phi-3.5-vision-instruct"""
+
+    def __init__(
+        self,
+        processor: ProcessorMixin,
+        chat_template: Optional[str] = None,
+        image_size: int | tuple[int, int] | None = None,
+        image_resize_algorithm: Resampling | None = None,
+    ):
+        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
+        self.image_token = "<|image|>"  # nosec
+        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
+            self.image_token
+        )
+
+
 def get_processing_strategy(
    processor: ProcessorMixin,
    chat_template,
@@ -279,6 +296,10 @@ def get_processing_strategy(
        return Gemma3ProcessingStrategy(
            processor, chat_template, image_size, image_resize_algorithm
        )
+    if chat_template_type == "phi_35_vl":
+        return Phi35VLProcessingStrategy(
+            processor, chat_template, image_size, image_resize_algorithm
+        )
    if chat_template_type in [
        "llama3_2_vision",
        "llama4",
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
@@ -32,6 +32,7 @@ _CHAT_TEMPLATES = {
    "llava": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
    "phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
    "phi_35": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}",
+    "phi_35_vl": "{% set image_count = namespace(value=0) %}{% for message in messages %}{{'<|' + message['role'] + '|>\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% set message_images = [] %}{% set message_text = [] %}{% for chunk in message['content'] %}{% if chunk['type'] == 'image' or 'image' in chunk or 'image_url' in chunk %}{% set image_count.value = image_count.value + 1 %}{% set _ = message_images.append('<|image_' + image_count.value|string + '|>\n') %}{% elif chunk['type'] == 'text' %}{% set _ = message_text.append(chunk['text']) %}{% endif %}{% endfor %}{{ message_images | join('') }}{{ message_text | join('') }}{% endif %}{{ '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
    "phi_4": "{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
    "deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<｜User｜>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<｜Assistant｜>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<｜Assistant｜>' }}{% endif %}",
    "deepseek_v3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- else %}{{'<｜Assistant｜>' + message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- endif %}{%- endfor %}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}",
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -526,9 +526,8 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
    if len(datasets) == 1:
        ds = datasets[0]

-        # Do not shuffle if curriculum sampling is enabled or
-        # shuffle_merged_datasets is disabled
-        if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
+        # Do not shuffle if curriculum sampling is enabled
+        if cfg.curriculum_sampling:
            return ds

        return ds.shuffle(seed=cfg.seed)
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -48,6 +48,8 @@ class ChatTemplate(str, Enum):
    llama4 = "llama4"
    phi_3 = "phi_3"
    phi_35 = "phi_35"
+    phi_35_vl = "phi_35_vl"
+    phi_4 = "phi_4"
    deepseek_v2 = "deepseek_v2"
    deepseek_v3 = "deepseek_v3"
    jamba = "jamba"
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -609,9 +609,6 @@ def prepare_opinionated_env(cfg):
    if cfg.qlora_sharded_model_loading:
        # model loading is forked after the tokenizer
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    if cfg.sample_packing:
-        # multipack parallel packing sampler defaults to using fork
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"


 def setup_trainer(
Author	SHA1	Message	Date
NanoCode012	1f2f285173	fix: missing key in enum	2025-07-03 13:46:16 +08:00
NanoCode012	98e912e416	feat: add custom processing strategy for phi35 vl	2025-07-03 13:46:16 +08:00
NanoCode012	e1528fb381	feat: add phi_35_vl support	2025-07-03 13:46:16 +08:00