From 0102ca5943eefa5c374e38bc9eab2510f577b476 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 6 Jul 2025 19:11:46 -0400
Subject: [PATCH] fix cfg merge

---
 tests/conftest.py                |  7 +---
 tests/e2e/multigpu/test_llama.py | 64 ++++++++++++++++----------------
 2 files changed, 34 insertions(+), 37 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 51b5b1049..23012cfc3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator
 
 import datasets
@@ -432,11 +432,6 @@ def module_temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)
 
 
-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
-
-
 @pytest.fixture(scope="function", autouse=True)
 def cleanup_monkeypatches():
     from transformers import Trainer
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 4ced79632..30e7446b1 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -32,7 +32,7 @@ def sft_base_cfg():
     cfg = DictDefault(
         base_model="HuggingFaceTB/SmolLM2-135M",
         tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
-        sequence_len=2048,
+        sequence_len=1024,
         special_tokens={
             "pad_token": "<|endoftext|>",
         },
@@ -442,36 +442,38 @@ class TestMultiGPULlama:
         self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "pad_to_sequence_len": True,
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                    },
+                    "use_tensorboard": True,
+                }
+            ) | sft_prepared_dataset_alpaca_cfg
         )
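
Note on the merge semantics (a minimal sketch, not part of the patch): this
assumes DictDefault follows plain-dict "|" union (PEP 584), where the
right-hand operand wins on key collisions, so values from the
sft_prepared_dataset_alpaca_cfg fixture override the test-local overrides.
The names and values below are illustrative stand-ins, not taken from the
repo:

    # Plain dicts standing in for DictDefault; axolotl's DictDefault is
    # assumed to behave the same way under "|" (right operand wins on
    # duplicate keys).
    test_overrides = {
        "max_steps": 2,
        "sequence_len": 512,  # hypothetical local value
    }
    fixture_cfg = {
        "base_model": "HuggingFaceTB/SmolLM2-135M",
        "sequence_len": 1024,  # hypothetical fixture value
    }

    cfg = test_overrides | fixture_cfg
    print(cfg["sequence_len"])  # 1024: the fixture (right operand) wins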