From c40da3b5eb6ccb21ef0c33dc228e52d4b1a37bbf Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 6 Jul 2025 11:44:31 -0400
Subject: [PATCH] use shared fixture for preprocessed alpaca dataset

---
 tests/conftest.py                |   9 +
 tests/e2e/multigpu/test_llama.py | 535 +++++++++++++++----------------
 2 files changed, 262 insertions(+), 282 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index bbe2d10ee..51b5b1049 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -423,6 +423,15 @@ def temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)


+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
+
+
 @pytest.fixture(scope="function", autouse=True)
 def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
     os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index d84505714..a2a3a4ea5 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
@@ -12,6 +14,8 @@ from huggingface_hub import snapshot_download
 from packaging import version
 from transformers.testing_utils import get_torch_dist_unique_port

+from axolotl.cli.args import PreprocessCliArgs
+from axolotl.cli.preprocess import do_preprocess
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_tensorboard, require_torch_2_6_0

@@ -25,6 +29,40 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        sequence_len=2048,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+    )
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+    do_preprocess(cfg, PreprocessCliArgs())
+    return cfg
+
+
 def transformers_version_eq(required_version):
     return version.parse(transformers.__version__) == version.parse(required_version)

@@ -34,42 +72,31 @@ class TestMultiGPULlama:
     Test case for Llama models using LoRA
     """

-    def test_lora_ddp(self, temp_dir):
+    def test_lora_ddp(self, temp_dir, sft_prepared_dataset_alpaca_cfg):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -97,45 +124,36 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    def test_lora_ddp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:20%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "val_set_size": 0.05,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -392,25 +410,13 @@ class TestMultiGPULlama:
         "fsdp_state_dict_type",
         ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
     )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
                 "num_epochs": 1,
                 "max_steps": 2,
                 "micro_batch_size": 2,
@@ -438,6 +444,7 @@ class TestMultiGPULlama:
                 },
                 "use_tensorboard": True,
             }
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -471,51 +478,43 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )
         if attention_backend == "flash":
             cfg.flash_attention = True
@@ -543,64 +542,55 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
         )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -641,7 +631,12 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
     ):
         # pylint: disable=duplicate-code
         if qlora:
@@ -655,37 +650,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -717,7 +700,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
    )
-    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero2_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -730,37 +719,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -792,7 +769,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero1_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -805,37 +788,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file