fix cfg merge

This commit is contained in:
Wing Lian
2025-07-06 19:11:46 -04:00
parent 97e8c01a70
commit 0102ca5943
2 changed files with 34 additions and 37 deletions

View File

@@ -10,7 +10,7 @@ import shutil
import sys import sys
import tempfile import tempfile
import time import time
from pathlib import Path, PosixPath from pathlib import Path
from typing import Generator from typing import Generator
import datasets import datasets
@@ -432,11 +432,6 @@ def module_temp_dir() -> Generator[str, None, None]:
shutil.rmtree(_temp_dir) shutil.rmtree(_temp_dir)
@pytest.fixture(scope="function", autouse=True)
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
@pytest.fixture(scope="function", autouse=True) @pytest.fixture(scope="function", autouse=True)
def cleanup_monkeypatches(): def cleanup_monkeypatches():
from transformers import Trainer from transformers import Trainer

View File

@@ -32,7 +32,7 @@ def sft_base_cfg():
cfg = DictDefault( cfg = DictDefault(
base_model="HuggingFaceTB/SmolLM2-135M", base_model="HuggingFaceTB/SmolLM2-135M",
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
sequence_len=2048, sequence_len=1024,
special_tokens={ special_tokens={
"pad_token": "<|endoftext|>", "pad_token": "<|endoftext|>",
}, },
@@ -442,36 +442,38 @@ class TestMultiGPULlama:
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
): ):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = (
{ DictDefault(
"pad_to_sequence_len": True, {
"num_epochs": 1, "pad_to_sequence_len": True,
"max_steps": 2, "num_epochs": 1,
"micro_batch_size": 2, "max_steps": 2,
"gradient_accumulation_steps": 2, "micro_batch_size": 2,
# "gradient_checkpointing": True, "gradient_accumulation_steps": 2,
"output_dir": temp_dir, # "gradient_checkpointing": True,
"dataset_prepared_path": temp_dir + "/last_run_prepared", "output_dir": temp_dir,
"learning_rate": 0.00001, "dataset_prepared_path": temp_dir + "/last_run_prepared",
"optimizer": "adamw_torch_fused", "learning_rate": 0.00001,
"lr_scheduler": "cosine", "optimizer": "adamw_torch_fused",
"flash_attention": True, "lr_scheduler": "cosine",
"fsdp": [ "flash_attention": True,
"full_shard", "fsdp": [
"auto_wrap", "full_shard",
], "auto_wrap",
"fsdp_config": { ],
"fsdp_limit_all_gathers": True, "fsdp_config": {
"fsdp_offload_params": False, "fsdp_limit_all_gathers": True,
"fsdp_sync_module_states": True, "fsdp_offload_params": False,
"fsdp_use_orig_params": False, "fsdp_sync_module_states": True,
"fsdp_cpu_ram_efficient_loading": False, "fsdp_use_orig_params": False,
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", "fsdp_cpu_ram_efficient_loading": False,
"fsdp_state_dict_type": fsdp_state_dict_type, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_state_dict_type": fsdp_state_dict_type,
}, "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
"use_tensorboard": True, },
} "use_tensorboard": True,
}
)
| sft_prepared_dataset_alpaca_cfg | sft_prepared_dataset_alpaca_cfg
) )