Compare commits: feat/phi_3...shared-pre (11 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | b79996bdc4 |  |
|  | 68368de7ed |  |
|  | a94c4a014b |  |
|  | 0102ca5943 |  |
|  | 97e8c01a70 |  |
|  | 5c4705b185 |  |
|  | 47a88da330 |  |
|  | 07ab737a55 |  |
|  | c40da3b5eb |  |
|  | a5946ff1f0 |  |
|  | 70ca1b2291 |  |
.github/workflows/base.yml (vendored): 6 changes
```diff
@@ -5,11 +5,13 @@ on:
     branches:
       - "main"
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
   pull_request:
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
       - '.github/workflows/base.yml'
   workflow_dispatch:

```
.github/workflows/tests-nightly.yml (vendored): 115 changes
```diff
@@ -18,96 +18,9 @@ jobs:
     env:
       SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
-    needs: [preload-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
@@ -120,14 +33,11 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd

       - name: Setup Python
         uses: actions/setup-python@v5
@@ -168,10 +78,6 @@ jobs:
         run: |
           axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
       - name: Run tests
         run: |
           pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -193,15 +99,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
             pytorch: 2.6.0
             num_gpus: 1
```
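The structural change here is that the nightly workflow no longer rebuilds the Hugging Face cache in a dedicated `preload-cache` job; the PyTest job now pulls a pre-built `hf-cache.tar.zst` from CloudFront in a single `curl | tar` step. Below is a rough sketch of what the producer side of that artifact could look like; the script, bucket name, and AWS CLI usage are assumptions for illustration, and only the CloudFront URL above comes from the diff.

```python
"""Hypothetical producer for the hf-cache.tar.zst artifact (not part of this diff)."""
import subprocess
from pathlib import Path

HUB_DIR = Path.home() / ".cache" / "huggingface" / "hub"


def build_and_upload_cache(bucket: str = "example-hf-cache-bucket") -> None:
    # Pack the HF hub cache with zstd compression via GNU tar's -I flag;
    # the workflow unpacks it with `--use-compress-program unzstd`.
    subprocess.run(
        ["tar", "-I", "zstd", "-cf", "hf-cache.tar.zst", "-C", str(HUB_DIR), "."],
        check=True,
    )
    # Upload to the S3 origin behind the CloudFront distribution (bucket name assumed).
    subprocess.run(
        ["aws", "s3", "cp", "hf-cache.tar.zst", f"s3://{bucket}/hf-cache.tar.zst"],
        check=True,
    )


if __name__ == "__main__":
    build_and_upload_cache()
```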
```diff
@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
     pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10
+
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+    fi
```
````diff
@@ -16,7 +16,6 @@ format:
 - [Gemma-3](#sec-gemma-3)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
-- [Phi3-V](#sec-phi3-v)

 ## Usage

@@ -127,15 +126,6 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
 chat_template: qwen2_vl # same as qwen2-vl
 ```

-### Phi3-V {#sec-phi3-v}
-
-```yaml
-base_model: microsoft/Phi-3.5-vision-instruct
-
-trust_remote_code: true
-chat_template: phi_35_vl
-```
-
 ## Dataset Format

 For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
````
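The surviving context line refers to the extended `chat_template` dataset format modeled on OpenAI's Message layout. For orientation, here is an illustrative record in that shape; the exact keys axolotl accepts are defined in the multimodal docs themselves, so treat the field names below (`path` in particular) as an assumption rather than a spec.

```python
# Illustrative multi-modal record shaped like the OpenAI message format.
sample = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "path": "/data/images/cat.jpg"},  # image part
                {"type": "text", "text": "What animal is in this picture?"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "A cat."}],
        },
    ]
}
```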
```diff
@@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC):
         if self.cfg.bf16 == "full":
             training_args_kwargs["bf16_full_eval"] = True
         else:
-            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = bf16 if bf16 is not None else False
+            training_args_kwargs["bf16"] = bf16

     def _configure_scheduler(self, training_args_kwargs: dict):
         if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
```
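The point of this rewrite is that `cfg.bf16` is effectively tri-state (True, False, or unset), while `transformers.TrainingArguments` expects a plain bool, so an unset flag must be coerced to `False` rather than forwarded as `None`. A minimal sketch of the behavior difference (the `DummyCfg` class is illustrative, not from the codebase):

```python
# Minimal sketch: why the explicit None check matters.
class DummyCfg:
    bf16 = None      # user never set bf16...
    bfloat16 = None  # ...nor the bfloat16 alias


cfg = DummyCfg()

# Old behavior: `None or None` evaluates to None, which then leaked
# into training_args_kwargs["bf16"].
old_value = cfg.bf16 or cfg.bfloat16
assert old_value is None

# New behavior: normalize the tri-state flag to a plain bool first.
bf16 = cfg.bf16 or cfg.bfloat16
bf16 = bf16 if bf16 is not None else False
assert bf16 is False
```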
```diff
@@ -1,7 +1,6 @@
 """Shared constants for axolotl.loaders module"""

 from transformers import (
-    AutoModelForCausalLM,
     Gemma3ForConditionalGeneration,
     Llama4ForConditionalGeneration,
     LlavaForConditionalGeneration,
@@ -19,6 +18,4 @@ MULTIMODAL_AUTO_MODEL_MAPPING = {
     "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
     "mistral3": Mistral3ForConditionalGeneration,
     "gemma3": Gemma3ForConditionalGeneration,
-    # phi3_v modeling code is not available in transformers yet
-    "phi3_v": AutoModelForCausalLM,
 }
```
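With the `phi3_v` entry gone, every value in `MULTIMODAL_AUTO_MODEL_MAPPING` is a concrete `transformers` class rather than the `AutoModelForCausalLM` fallback, so a loader can resolve the model class with a plain dict lookup. A hedged sketch of such a lookup over an illustrative subset of the mapping (the helper name is hypothetical):

```python
from transformers import Gemma3ForConditionalGeneration

# Illustrative subset of the mapping left after this change.
MULTIMODAL_AUTO_MODEL_MAPPING = {
    "gemma3": Gemma3ForConditionalGeneration,
}


def resolve_model_cls(model_type: str):
    # A plain lookup; failing fast surfaces unsupported model types early.
    try:
        return MULTIMODAL_AUTO_MODEL_MAPPING[model_type]
    except KeyError as err:
        raise ValueError(f"unsupported multimodal model type: {model_type}") from err
```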
```diff
@@ -264,23 +264,6 @@ class Gemma3ProcessingStrategy(ProcessingStrategy):
         return labels


-class Phi35VLProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for Phi-3.5-vision-instruct"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-        self.image_token = "<|image|>"  # nosec
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.image_token
-        )
-
-
 def get_processing_strategy(
     processor: ProcessorMixin,
     chat_template,
@@ -296,10 +279,6 @@ def get_processing_strategy(
         return Gemma3ProcessingStrategy(
             processor, chat_template, image_size, image_resize_algorithm
         )
-    if chat_template_type == "phi_35_vl":
-        return Phi35VLProcessingStrategy(
-            processor, chat_template, image_size, image_resize_algorithm
-        )
     if chat_template_type in [
         "llama3_2_vision",
         "llama4",
```
```diff
@@ -32,7 +32,6 @@ _CHAT_TEMPLATES = {
     "llava": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
     "phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
     "phi_35": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}",
-    "phi_35_vl": "{% set image_count = namespace(value=0) %}{% for message in messages %}{{'<|' + message['role'] + '|>\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% set message_images = [] %}{% set message_text = [] %}{% for chunk in message['content'] %}{% if chunk['type'] == 'image' or 'image' in chunk or 'image_url' in chunk %}{% set image_count.value = image_count.value + 1 %}{% set _ = message_images.append('<|image_' + image_count.value|string + '|>\n') %}{% elif chunk['type'] == 'text' %}{% set _ = message_text.append(chunk['text']) %}{% endif %}{% endfor %}{{ message_images | join('') }}{{ message_text | join('') }}{% endif %}{{ '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
     "phi_4": "{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
     "deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}",
     "deepseek_v3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
```
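These `_CHAT_TEMPLATES` entries are plain Jinja strings, so they can be rendered outside the tokenizer to inspect the exact prompt text. A standalone sketch using the `phi_35` template kept above; `jinja2` is used directly here for illustration, whereas axolotl normally applies these through the tokenizer's `apply_chat_template`:

```python
from jinja2 import Template

# The phi_35 template from _CHAT_TEMPLATES, split across lines for readability.
PHI_35 = (
    "{% for message in messages %}"
    "{% if message['role'] == 'system' and message['content'] %}"
    "{{'<|system|>\n' + message['content'] + '<|end|>\n'}}"
    "{% elif message['role'] == 'user' %}"
    "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"
    "{% elif message['role'] == 'assistant' %}"
    "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"
    "{% endif %}{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "Hi!"},
]
print(Template(PHI_35).render(messages=messages, add_generation_prompt=True))
# <|system|>
# You are concise.<|end|>
# <|user|>
# Hi!<|end|>
# <|assistant|>
```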
```diff
@@ -48,8 +48,6 @@ class ChatTemplate(str, Enum):
     llama4 = "llama4"
     phi_3 = "phi_3"
     phi_35 = "phi_35"
-    phi_35_vl = "phi_35_vl"
-    phi_4 = "phi_4"
     deepseek_v2 = "deepseek_v2"
     deepseek_v3 = "deepseek_v3"
     jamba = "jamba"
```
```diff
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator

 import datasets
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)


-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)


 @pytest.fixture(scope="function", autouse=True)
```
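The new `module_temp_dir` fixture is module-scoped: every test in a module receives the same directory, which is what lets the preprocessed-dataset fixture further down run `axolotl preprocess` once per module instead of once per test. A small sketch of that sharing (the test names are illustrative; pytest runs tests in definition order by default):

```python
import os


def test_writes_artifact(module_temp_dir):
    # First test in the module writes into the shared directory.
    with open(os.path.join(module_temp_dir, "marker.txt"), "w", encoding="utf-8") as f:
        f.write("prepared")


def test_sees_artifact(module_temp_dir):
    # Later tests in the same module see the same path and its contents.
    assert os.path.exists(os.path.join(module_temp_dir, "marker.txt"))
```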
```diff
@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
```
```diff
@@ -25,6 +27,60 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
+        sequence_len=1024,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+        seed=42,
+        # these need to be set since we aren't running schema validation
+        micro_batch_size=2,
+        gradient_accumulation_steps=1,
+    )
+
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+
+    Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
+    with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+    execute_subprocess_async(
+        [
+            "axolotl",
+            "preprocess",
+            str(Path(module_temp_dir) / "config.yaml"),
+        ]
+    )
+
+    # unset flash attention since we have some flex attention tests too
+    cfg.flash_attention = None
+    return cfg
+
+
 def transformers_version_eq(required_version):
     return version.parse(transformers.__version__) == version.parse(required_version)

```
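The tests below all build their config as `DictDefault({...}) | sft_prepared_dataset_alpaca_cfg`. Assuming `DictDefault.__or__` follows Python's dict-union semantics, keys from the right operand win on conflict, so the shared fixture supplies the base model, dataset, and packing settings while each test's literal dict adds scenario-specific overrides. A plain-dict analogy with illustrative values; whether `DictDefault` keeps exactly this behavior for overlapping or `None` values is not shown in this diff:

```python
# Plain-dict analogy for the DictDefault union used in the tests below.
test_specific = {"micro_batch_size": 1, "max_steps": 2}
shared_fixture = {"base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True}

cfg = test_specific | shared_fixture  # with plain dicts, right operand wins on conflict
assert cfg == {
    "micro_batch_size": 1,
    "max_steps": 2,
    "base_model": "HuggingFaceTB/SmolLM2-135M",
    "sample_packing": True,
}
```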
```diff
@@ -97,45 +153,36 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    def test_lora_ddp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:20%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "val_set_size": 0.05,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
```
```diff
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
         "fsdp_state_dict_type",
         ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
     )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
         )

     @require_torch_2_6_0
```
```diff
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )
         if attention_backend == "flash":
             cfg.flash_attention = True
```
```diff
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
         )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
```
```diff
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
     ):
         # pylint: disable=duplicate-code
         if qlora:
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
```
```diff
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero2_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
```
```diff
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero1_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

         # write cfg to yaml file
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.skip(
```