tweak loss

add seed for stable reproducibility
tweak acceptable loss from changed hyperparams
2025-07-06 19:42:43 -04:00 · 2025-07-06 19:29:51 -04:00 · 2025-07-06 19:25:26 -04:00 · 2025-07-06 19:11:46 -04:00 · 2025-07-06 18:55:16 -04:00 · 2025-07-06 13:27:55 -04:00
6 changed files with 309 additions and 396 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,11 +5,13 @@ on:
    branches:
      - "main"
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,96 +18,9 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    needs: [preload-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
@@ -120,14 +33,11 @@ jobs:
      - name: Check out repository code
        uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -168,10 +78,6 @@ jobs:
        run: |
          axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
      - name: Run tests
        run: |
          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -193,15 +99,8 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
+
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+    fi
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC):
        if self.cfg.bf16 == "full":
            training_args_kwargs["bf16_full_eval"] = True
        else:
-            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = bf16 if bf16 is not None else False
+            training_args_kwargs["bf16"] = bf16

    def _configure_scheduler(self, training_args_kwargs: dict):
        if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator

 import datasets
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
    shutil.rmtree(_temp_dir)


-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory once all tests in the module have finished
+    shutil.rmtree(_temp_dir)


@pytest.fixture(scope="function", autouse=True)
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
@@ -25,6 +27,60 @@ def download_model():
    snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
+        sequence_len=1024,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+        seed=42,
+        # these need to be set since we aren't running schema validation
+        micro_batch_size=2,
+        gradient_accumulation_steps=1,
+    )
+
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+
+    Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
+    with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+    execute_subprocess_async(
+        [
+            "axolotl",
+            "preprocess",
+            str(Path(module_temp_dir) / "config.yaml"),
+        ]
+    )
+
+    # unset flash attention since we have some flex attention tests too
+    cfg.flash_attention = None
+    return cfg
+
+
 def transformers_version_eq(required_version):
    return version.parse(transformers.__version__) == version.parse(required_version)

@@ -97,45 +153,36 @@ class TestMultiGPULlama:
        "gradient_accumulation_steps",
        [1, 2],
    )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    def test_lora_ddp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
+    ):
        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:20%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "val_set_size": 0.05,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "fsdp_state_dict_type",
        ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
    )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
        )

    @require_torch_2_6_0
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
        [True, False],
    )
    def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
    ):
        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )
        if attention_backend == "flash":
            cfg.flash_attention = True
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
        [True, False],
    )
    def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
    ):
        # pylint: disable=duplicate-code
        if qlora:
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
        "qlora",
        [True, False],
    )
-    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero2_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
        # pylint: disable=duplicate-code
        if qlora:
            adapter = {
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
        "qlora",
        [True, False],
    )
-    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero1_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
        # pylint: disable=duplicate-code
        if qlora:
            adapter = {
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
            }
        else:
            adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

        # write cfg to yaml file
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )

    @pytest.mark.skip(
Author	SHA1	Message	Date
Wing Lian	b79996bdc4	tweak loss	2025-07-06 19:42:43 -04:00
Wing Lian	68368de7ed	add seed for stable reproducibility	2025-07-06 19:29:51 -04:00
Wing Lian	a94c4a014b	tweak acceptable loss from changed hyperparams	2025-07-06 19:25:26 -04:00
Wing Lian	0102ca5943	fix cfg merge	2025-07-06 19:11:46 -04:00
Wing Lian	97e8c01a70	tweak losses	2025-07-06 18:55:16 -04:00
Wing Lian	5c4705b185	unset fa	2025-07-06 13:27:55 -04:00
Wing Lian	47a88da330	set mbsz and revert non-packed test	2025-07-06 12:27:25 -04:00
Wing Lian	07ab737a55	set tokenizer_config in fixture	2025-07-06 12:24:21 -04:00
Wing Lian	c40da3b5eb	use shared fixture for preprocessed alpaca dataset	2025-07-06 11:44:31 -04:00
Wing Lian	a5946ff1f0	build fa2 from source for base image with torch2.6 and cu124 (#2867)	2025-07-05 09:21:18 -04:00
Wing Lian	70ca1b2291	fix nightlies to use correct cache (#2848) [skip ci] * fix nightlies to use correct cache * fix for handling None for bf16	2025-07-03 12:21:39 -04:00