Compare commits

..

3 Commits

Author SHA1 Message Date
NanoCode012
1f2f285173 fix: missing key in enum 2025-07-03 13:46:16 +08:00
NanoCode012
98e912e416 feat: add custom processing strategy for phi35 vl 2025-07-03 13:46:16 +08:00
NanoCode012
e1528fb381 feat: add phi_35_vl support 2025-07-03 13:46:16 +08:00
11 changed files with 433 additions and 309 deletions

View File

@@ -5,13 +5,11 @@ on:
branches:
- "main"
paths:
- 'docker/Dockerfile-base'
- 'docker/Dockerfile-uv-base'
- 'Dockerfile-base'
- '.github/workflows/base.yml'
pull_request:
paths:
- 'docker/Dockerfile-base'
- 'docker/Dockerfile-uv-base'
- 'Dockerfile-base'
- '.github/workflows/base.yml'
workflow_dispatch:

View File

@@ -18,9 +18,96 @@ jobs:
env:
SKIP: no-commit-to-branch
preload-cache:
name: Preload HF cache
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.6.0"]
timeout-minutes: 20
env:
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
steps:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python_version }}
cache: 'pip' # caching pip dependencies
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
- name: Install PyTorch
run: |
pip3 install torch==${{ matrix.pytorch_version }}
- name: Install dependencies
run: |
pip3 show torch
pip3 install --no-build-isolation -U -e .
python scripts/unsloth_install.py | sh
python scripts/cutcrossentropy_install.py | sh
pip3 install -r requirements-dev.txt -r requirements-tests.txt
- name: Make sure PyTorch version wasn't clobbered
run: |
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
- name: Ensure axolotl CLI was installed
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v tests/conftest.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
files: ./coverage.xml
flags: unittests,pytorch-${{ matrix.pytorch_version }}
fail_ci_if_error: false
- name: cleanup pip cache
run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
- name: Save HF cache
id: hf-cache
uses: actions/cache/save@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
pytest:
name: PyTest
runs-on: ubuntu-latest
needs: [preload-cache]
strategy:
fail-fast: false
max-parallel: 2
@@ -33,11 +120,14 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v4
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
mkdir -p /home/runner/.cache/huggingface/hub
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
- name: Restore HF cache
id: hf-cache-restore
uses: actions/cache/restore@v4
with:
path: |
/home/runner/.cache/huggingface/hub/datasets--*
/home/runner/.cache/huggingface/hub/models--*
key: ${{ runner.os }}-hf-hub-cache-v2
- name: Setup Python
uses: actions/setup-python@v5
@@ -78,6 +168,10 @@ jobs:
run: |
axolotl --help
- name: Pre-Download dataset fixture
run: |
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
- name: Run tests
run: |
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -99,8 +193,15 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: 126
cuda_version: 12.6.3
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.5.1
num_gpus: 1
axolotl_extras:
nightly_build: "true"
- cuda: 124
cuda_version: 12.4.1
python_version: "3.11"
pytorch: 2.6.0
num_gpus: 1

View File

@@ -37,7 +37,3 @@ RUN git lfs install --skip-repo && \
pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10
RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
fi

View File

@@ -16,6 +16,7 @@ format:
- [Gemma-3](#sec-gemma-3)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
- [Phi3-V](#sec-phi3-v)
## Usage
@@ -126,6 +127,15 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
chat_template: qwen2_vl # same as qwen2-vl
```
### Phi3-V {#sec-phi3-v}
```yaml
base_model: microsoft/Phi-3.5-vision-instruct
trust_remote_code: true
chat_template: phi_35_vl
```
## Dataset Format
For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.

View File

@@ -219,9 +219,7 @@ class TrainerBuilderBase(abc.ABC):
if self.cfg.bf16 == "full":
training_args_kwargs["bf16_full_eval"] = True
else:
bf16 = self.cfg.bf16 or self.cfg.bfloat16
bf16 = bf16 if bf16 is not None else False
training_args_kwargs["bf16"] = bf16
training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
def _configure_scheduler(self, training_args_kwargs: dict):
if self.cfg.lr_scheduler in ["one_cycle", "rex"]:

View File

@@ -1,6 +1,7 @@
"""Shared constants for axolotl.loaders module"""
from transformers import (
AutoModelForCausalLM,
Gemma3ForConditionalGeneration,
Llama4ForConditionalGeneration,
LlavaForConditionalGeneration,
@@ -18,4 +19,6 @@ MULTIMODAL_AUTO_MODEL_MAPPING = {
"qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
"mistral3": Mistral3ForConditionalGeneration,
"gemma3": Gemma3ForConditionalGeneration,
# phi3_v modeling code is not available in transformers yet
"phi3_v": AutoModelForCausalLM,
}

View File

@@ -264,6 +264,23 @@ class Gemma3ProcessingStrategy(ProcessingStrategy):
return labels
class Phi35VLProcessingStrategy(ProcessingStrategy):
    """Processing strategy for Microsoft's Phi-3.5-vision-instruct model.

    Extends the base multimodal strategy by resolving the model's image
    placeholder token and caching its tokenizer id for downstream use.
    """

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        # Phi-3.5-vision marks image positions with this placeholder token.
        placeholder = "<|image|>"  # nosec
        self.image_token = placeholder
        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(placeholder)
def get_processing_strategy(
processor: ProcessorMixin,
chat_template,
@@ -279,6 +296,10 @@ def get_processing_strategy(
return Gemma3ProcessingStrategy(
processor, chat_template, image_size, image_resize_algorithm
)
if chat_template_type == "phi_35_vl":
return Phi35VLProcessingStrategy(
processor, chat_template, image_size, image_resize_algorithm
)
if chat_template_type in [
"llama3_2_vision",
"llama4",

View File

@@ -32,6 +32,7 @@ _CHAT_TEMPLATES = {
"llava": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
"phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
"phi_35": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}",
"phi_35_vl": "{% set image_count = namespace(value=0) %}{% for message in messages %}{{'<|' + message['role'] + '|>\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% set message_images = [] %}{% set message_text = [] %}{% for chunk in message['content'] %}{% if chunk['type'] == 'image' or 'image' in chunk or 'image_url' in chunk %}{% set image_count.value = image_count.value + 1 %}{% set _ = message_images.append('<|image_' + image_count.value|string + '|>\n') %}{% elif chunk['type'] == 'text' %}{% set _ = message_text.append(chunk['text']) %}{% endif %}{% endfor %}{{ message_images | join('') }}{{ message_text | join('') }}{% endif %}{{ '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
"phi_4": "{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
"deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<User>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<Assistant>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<Assistant>' }}{% endif %}",
"deepseek_v3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<User>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<Assistant><tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{%- else %}{{'<Assistant>' + message['content'] + '<tool▁calls▁begin><tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<tool▁call▁begin>' + tool['type'] + '<tool▁sep>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<tool▁call▁end>'}}{%- endif %}{%- endfor %}{{'<tool▁calls▁end><end▁of▁sentence>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<tool▁outputs▁end>' + message['content'] + '<end▁of▁sentence>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif 
%}{{'<Assistant>' + content + '<end▁of▁sentence>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<tool▁outputs▁begin><tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<tool▁output▁begin>' + message['content'] + '<tool▁output▁end>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<tool▁outputs▁end>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<Assistant>'}}{% endif %}",

View File

@@ -48,6 +48,8 @@ class ChatTemplate(str, Enum):
llama4 = "llama4"
phi_3 = "phi_3"
phi_35 = "phi_35"
phi_35_vl = "phi_35_vl"
phi_4 = "phi_4"
deepseek_v2 = "deepseek_v2"
deepseek_v3 = "deepseek_v3"
jamba = "jamba"

View File

@@ -10,7 +10,7 @@ import shutil
import sys
import tempfile
import time
from pathlib import Path
from pathlib import Path, PosixPath
from typing import Generator
import datasets
@@ -423,13 +423,9 @@ def temp_dir() -> Generator[str, None, None]:
shutil.rmtree(_temp_dir)
@pytest.fixture(scope="module")
def module_temp_dir() -> Generator[str, None, None]:
# Create a temporary directory
_temp_dir = tempfile.mkdtemp()
yield _temp_dir
# Clean up the directory after the test
shutil.rmtree(_temp_dir)
@pytest.fixture(scope="function", autouse=True)
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
@pytest.fixture(scope="function", autouse=True)

View File

@@ -2,8 +2,6 @@
E2E tests for multigpu lora tinyllama
"""
# pylint: disable=redefined-outer-name
from pathlib import Path
import pytest
@@ -27,60 +25,6 @@ def download_model():
snapshot_download("HuggingFaceTB/SmolLM2-135M")
@pytest.fixture(scope="module")
def sft_base_cfg():
cfg = DictDefault(
base_model="HuggingFaceTB/SmolLM2-135M",
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
sequence_len=1024,
special_tokens={
"pad_token": "<|endoftext|>",
},
datasets=[
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
val_set_size=0.1,
sample_packing=True,
flash_attention=True,
learning_rate=0.00001,
optimizer="adamw_8bit",
seed=42,
# these need to be set since we aren't running schema validation
micro_batch_size=2,
gradient_accumulation_steps=1,
)
return cfg
@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
dataset_prepared_path = module_temp_dir + "/last_run_prepared"
cfg = sft_base_cfg | DictDefault(
dataset_prepared_path=dataset_prepared_path,
)
Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
execute_subprocess_async(
[
"axolotl",
"preprocess",
str(Path(module_temp_dir) / "config.yaml"),
]
)
# unset flash attention since we have some flex attention tests too
cfg.flash_attention = None
return cfg
def transformers_version_eq(required_version):
return version.parse(transformers.__version__) == version.parse(required_version)
@@ -153,36 +97,45 @@ class TestMultiGPULlama:
"gradient_accumulation_steps",
[1, 2],
)
def test_lora_ddp_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
):
def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
# pylint: disable=duplicate-code
cfg = (
DictDefault(
{
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
}
)
| sft_prepared_dataset_alpaca_cfg
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 2048,
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"adapter": "lora",
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:20%]",
},
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
}
)
# write cfg to yaml file
@@ -432,50 +385,59 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
"fsdp_state_dict_type",
["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
)
def test_fsdp_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
):
def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
# pylint: disable=duplicate-code
cfg = (
DictDefault(
{
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
"use_tensorboard": True,
}
)
| sft_prepared_dataset_alpaca_cfg
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": fsdp_state_dict_type,
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
)
# write cfg to yaml file
@@ -496,7 +458,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
)
@require_torch_2_6_0
@@ -509,43 +471,51 @@ class TestMultiGPULlama:
[True, False],
)
def test_fsdp2_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
attention_backend,
fsdp_reshard_after_forward,
self, temp_dir, attention_backend, fsdp_reshard_after_forward
):
# pylint: disable=duplicate-code
cfg = (
DictDefault(
{
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_8bit",
"lr_scheduler": "cosine",
"fsdp": [
"auto_wrap",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 2048,
"val_set_size": 0.1,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
"use_tensorboard": True,
}
)
| sft_prepared_dataset_alpaca_cfg
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_8bit",
"lr_scheduler": "cosine",
"fsdp": [
"auto_wrap",
],
"fsdp_config": {
"fsdp_version": 2,
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
"fsdp_offload_params": False,
"fsdp_cpu_ram_efficient_loading": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
},
"use_tensorboard": True,
}
)
if attention_backend == "flash":
cfg.flash_attention = True
@@ -573,55 +543,64 @@ class TestMultiGPULlama:
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
)
def test_fsdp_qlora_prequant_packed(
self, temp_dir, sft_prepared_dataset_alpaca_cfg
):
def test_fsdp_qlora_prequant_packed(self, temp_dir):
# pylint: disable=duplicate-code
cfg = (
DictDefault(
{
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
"adapter": "qlora",
"mean_resizing_embeddings": True,
"load_in_4bit": True,
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
# "lora_modules_to_save": [
# "embed_tokens",
# "lm_head",
# ],
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
cfg = DictDefault(
{
"base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
"adapter": "qlora",
"mean_resizing_embeddings": True,
"load_in_4bit": True,
"lora_r": 8,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": True,
# "lora_modules_to_save": [
# "embed_tokens",
# "lm_head",
# ],
"sample_packing": True,
"eval_sample_packing": False,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
"use_tensorboard": True,
}
)
| sft_prepared_dataset_alpaca_cfg
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 2,
# "gradient_checkpointing": True,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"fsdp": [
"full_shard",
"auto_wrap",
],
"fsdp_config": {
"fsdp_limit_all_gathers": True,
"fsdp_offload_params": False,
"fsdp_sync_module_states": True,
"fsdp_use_orig_params": False,
"fsdp_cpu_ram_efficient_loading": True,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
}
)
# write cfg to yaml file
@@ -662,12 +641,7 @@ class TestMultiGPULlama:
[True, False],
)
def test_ds_zero3_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
deepspeed,
qlora,
self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
):
# pylint: disable=duplicate-code
if qlora:
@@ -681,25 +655,37 @@ class TestMultiGPULlama:
}
else:
adapter = {}
cfg = (
DictDefault(
{
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
**adapter,
}
)
| sft_prepared_dataset_alpaca_cfg
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.05,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
**adapter,
}
)
# write cfg to yaml file
@@ -720,7 +706,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -731,13 +717,7 @@ class TestMultiGPULlama:
"qlora",
[True, False],
)
def test_ds_zero2_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
# pylint: disable=duplicate-code
if qlora:
adapter = {
@@ -750,25 +730,37 @@ class TestMultiGPULlama:
}
else:
adapter = {}
cfg = (
DictDefault(
{
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
**adapter,
}
)
| sft_prepared_dataset_alpaca_cfg
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
**adapter,
}
)
# write cfg to yaml file
@@ -789,7 +781,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.parametrize(
@@ -800,13 +792,7 @@ class TestMultiGPULlama:
"qlora",
[True, False],
)
def test_ds_zero1_packed(
self,
temp_dir,
sft_prepared_dataset_alpaca_cfg,
gradient_accumulation_steps,
qlora,
):
def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
# pylint: disable=duplicate-code
if qlora:
adapter = {
@@ -819,25 +805,37 @@ class TestMultiGPULlama:
}
else:
adapter = {}
cfg = (
DictDefault(
{
"pad_to_sequence_len": True,
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
**adapter,
}
)
| sft_prepared_dataset_alpaca_cfg
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 1024,
"val_set_size": 0.01,
"special_tokens": {
"pad_token": "<|endoftext|>",
},
"datasets": [
{
"path": "tatsu-lab/alpaca",
"type": "alpaca",
"split": "train[:10%]",
},
],
"num_epochs": 1,
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"output_dir": temp_dir,
"dataset_prepared_path": temp_dir + "/last_run_prepared",
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
"lr_scheduler": "cosine",
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
**adapter,
}
)
# write cfg to yaml file
@@ -858,7 +856,7 @@ class TestMultiGPULlama:
)
check_tensorboard(
temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
)
@pytest.mark.skip(