fix cfg merge
This commit is contained in:
@@ -10,7 +10,7 @@ import shutil
|
|||||||
import sys
|
import sys
|
||||||
import tempfile
|
import tempfile
|
||||||
import time
|
import time
|
||||||
from pathlib import Path, PosixPath
|
from pathlib import Path
|
||||||
from typing import Generator
|
from typing import Generator
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
@@ -432,11 +432,6 @@ def module_temp_dir() -> Generator[str, None, None]:
|
|||||||
shutil.rmtree(_temp_dir)
|
shutil.rmtree(_temp_dir)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
|
||||||
def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
|
|
||||||
os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="function", autouse=True)
|
@pytest.fixture(scope="function", autouse=True)
|
||||||
def cleanup_monkeypatches():
|
def cleanup_monkeypatches():
|
||||||
from transformers import Trainer
|
from transformers import Trainer
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ def sft_base_cfg():
|
|||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
base_model="HuggingFaceTB/SmolLM2-135M",
|
base_model="HuggingFaceTB/SmolLM2-135M",
|
||||||
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
|
tokenizer_config="HuggingFaceTB/SmolLM2-135M", # this has to be manually set since we haven't done validation
|
||||||
sequence_len=2048,
|
sequence_len=1024,
|
||||||
special_tokens={
|
special_tokens={
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
@@ -442,36 +442,38 @@ class TestMultiGPULlama:
|
|||||||
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
|
self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
|
||||||
):
|
):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = (
|
||||||
{
|
DictDefault(
|
||||||
"pad_to_sequence_len": True,
|
{
|
||||||
"num_epochs": 1,
|
"pad_to_sequence_len": True,
|
||||||
"max_steps": 2,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 2,
|
"max_steps": 2,
|
||||||
"gradient_accumulation_steps": 2,
|
"micro_batch_size": 2,
|
||||||
# "gradient_checkpointing": True,
|
"gradient_accumulation_steps": 2,
|
||||||
"output_dir": temp_dir,
|
# "gradient_checkpointing": True,
|
||||||
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
"output_dir": temp_dir,
|
||||||
"learning_rate": 0.00001,
|
"dataset_prepared_path": temp_dir + "/last_run_prepared",
|
||||||
"optimizer": "adamw_torch_fused",
|
"learning_rate": 0.00001,
|
||||||
"lr_scheduler": "cosine",
|
"optimizer": "adamw_torch_fused",
|
||||||
"flash_attention": True,
|
"lr_scheduler": "cosine",
|
||||||
"fsdp": [
|
"flash_attention": True,
|
||||||
"full_shard",
|
"fsdp": [
|
||||||
"auto_wrap",
|
"full_shard",
|
||||||
],
|
"auto_wrap",
|
||||||
"fsdp_config": {
|
],
|
||||||
"fsdp_limit_all_gathers": True,
|
"fsdp_config": {
|
||||||
"fsdp_offload_params": False,
|
"fsdp_limit_all_gathers": True,
|
||||||
"fsdp_sync_module_states": True,
|
"fsdp_offload_params": False,
|
||||||
"fsdp_use_orig_params": False,
|
"fsdp_sync_module_states": True,
|
||||||
"fsdp_cpu_ram_efficient_loading": False,
|
"fsdp_use_orig_params": False,
|
||||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
"fsdp_cpu_ram_efficient_loading": False,
|
||||||
"fsdp_state_dict_type": fsdp_state_dict_type,
|
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
"fsdp_state_dict_type": fsdp_state_dict_type,
|
||||||
},
|
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||||
"use_tensorboard": True,
|
},
|
||||||
}
|
"use_tensorboard": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
| sft_prepared_dataset_alpaca_cfg
|
| sft_prepared_dataset_alpaca_cfg
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user