From c40da3b5eb6ccb21ef0c33dc228e52d4b1a37bbf Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 6 Jul 2025 11:44:31 -0400
Subject: [PATCH] use shared fixture for preprocessed alpaca dataset

---
 tests/conftest.py                |   9 +
 tests/e2e/multigpu/test_llama.py | 535 +++++++++++++++----------------
 2 files changed, 262 insertions(+), 282 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index bbe2d10ee..51b5b1049 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -423,6 +423,15 @@ def temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)


+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
+
+
 @pytest.fixture(scope="function", autouse=True)
 def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
     os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index d84505714..a2a3a4ea5 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
@@ -12,6 +14,8 @@ from huggingface_hub import snapshot_download
 from packaging import version
 from transformers.testing_utils import get_torch_dist_unique_port

+from axolotl.cli.args import PreprocessCliArgs
+from axolotl.cli.preprocess import do_preprocess
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_tensorboard, require_torch_2_6_0

@@ -25,6 +29,40 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        sequence_len=2048,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+    )
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+    do_preprocess(cfg, PreprocessCliArgs())
+    return cfg
+
+
 def transformers_version_eq(required_version):
     return version.parse(transformers.__version__) == version.parse(required_version)

@@ -34,42 +72,31 @@ class TestMultiGPULlama:
     Test case for Llama models using LoRA
     """

-    def test_lora_ddp(self, temp_dir):
+    def test_lora_ddp(self, temp_dir, sft_prepared_dataset_alpaca_cfg):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -97,45 +124,36 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    def test_lora_ddp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:20%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "val_set_size": 0.05,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -392,25 +410,13 @@ class TestMultiGPULlama:
         "fsdp_state_dict_type",
         ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
     )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
                 "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
                 "num_epochs": 1,
                 "max_steps": 2,
                 "micro_batch_size": 2,
@@ -438,6 +444,7 @@ class TestMultiGPULlama:
                 },
                 "use_tensorboard": True,
             }
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -471,51 +478,43 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )
         if attention_backend == "flash":
             cfg.flash_attention = True
@@ -543,64 +542,55 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
         )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -641,7 +631,12 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
     ):
         # pylint: disable=duplicate-code
         if qlora:
@@ -655,37 +650,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -717,7 +700,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
    )
-    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero2_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -730,37 +719,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -792,7 +769,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero1_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -805,37 +788,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file