Compare commits

..

10 Commits

Author SHA1 Message Date
Sung Ching Liu
f68aedd1f8 Update __init__.py 2025-02-26 00:21:16 -05:00
Sunny Liu
3dd5c6f8ec nit 2025-02-26 00:21:16 -05:00
Sunny Liu
4caa59a087 auto detect tp_size 2025-02-26 00:21:16 -05:00
Sunny Liu
984be14147 add tp_size in config doc 2025-02-26 00:21:16 -05:00
Sunny Liu
64adbf1a15 tp plan not needed 2025-02-26 00:21:16 -05:00
Sunny Liu
438b623031 prepare accelerate envs for tp 2025-02-26 00:21:16 -05:00
Sunny Liu
a74efcecbe skip move to device 2025-02-26 00:21:16 -05:00
Sunny Liu
d663652216 del device_map for tp 2025-02-26 00:21:16 -05:00
Sunny Liu
dbd43aa18f set tp_plan 2025-02-26 00:21:16 -05:00
Sunny Liu
dbdf97e828 enabe tp thru tp_size 2025-02-26 00:21:16 -05:00
17 changed files with 60 additions and 26 deletions

View File

@@ -78,6 +78,9 @@ tf32: true # require >=ampere
bfloat16: true # require >=ampere bfloat16: true # require >=ampere
float16: true float16: true
# Use tensor parallelism
tensor_parallel: true # require multi-GPU
# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset # Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
gpu_memory_limit: 20GiB gpu_memory_limit: 20GiB
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge # Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge

View File

@@ -703,6 +703,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
"accelerator_config" "accelerator_config"
] = self.cfg.accelerator_config ] = self.cfg.accelerator_config
if self.cfg.tensor_parallel:
training_arguments_kwargs["tp_size"] = torch.cuda.device_count()
if self.cfg.kd_ce_alpha is not None: if self.cfg.kd_ce_alpha is not None:
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
if self.cfg.kd_alpha is not None: if self.cfg.kd_alpha is not None:

View File

@@ -748,6 +748,8 @@ class AxolotlInputConfig(
local_rank: Optional[int] = None local_rank: Optional[int] = None
ddp: Optional[bool] = None ddp: Optional[bool] = None
tensor_parallel: Optional[bool] = None
seed: Optional[int] = None seed: Optional[int] = None
ddp_timeout: Optional[int] = None ddp_timeout: Optional[int] = None
ddp_bucket_cap_mb: Optional[int] = None ddp_bucket_cap_mb: Optional[int] = None
@@ -1371,6 +1373,13 @@ class AxolotlInputConfig(
) )
return data return data
@model_validator(mode="before")
@classmethod
def check_fsdp_tp(cls, data):
    """Reject configurations that enable both FSDP and tensor parallelism.

    Runs as a "before" validator on the raw input mapping.

    Args:
        data: Raw config mapping; the ``fsdp`` and ``tensor_parallel``
            keys are read with ``.get`` and treated as truthy/falsy flags.

    Returns:
        The unmodified ``data`` mapping when the combination is allowed.

    Raises:
        ValueError: If both ``fsdp`` and ``tensor_parallel`` are truthy.
    """
    fsdp_enabled = bool(data.get("fsdp"))
    tp_enabled = bool(data.get("tensor_parallel"))
    if not (fsdp_enabled and tp_enabled):
        return data
    raise ValueError("FSDP with tensor parallelism is not supported yet.")
@model_validator(mode="after") @model_validator(mode="after")
def check_fft_possible_bad_config(self): def check_fft_possible_bad_config(self):
if ( if (

View File

@@ -762,6 +762,9 @@ class ModelLoader:
return hf_ds_cfg return hf_ds_cfg
skip_move_to_device = False skip_move_to_device = False
if self.cfg.tensor_parallel:
del self.model_kwargs["device_map"]
if ( # pylint: disable=condition-evals-to-constant) if ( # pylint: disable=condition-evals-to-constant)
(self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading) (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
and not qlora_fsdp and not qlora_fsdp

View File

@@ -547,6 +547,7 @@ def prepare_optim_env(cfg):
if not check_cuda_p2p_ib_support(): if not check_cuda_p2p_ib_support():
if os.getenv("NCCL_P2P_DISABLE") is None: if os.getenv("NCCL_P2P_DISABLE") is None:
os.environ["NCCL_P2P_DISABLE"] = "1" os.environ["NCCL_P2P_DISABLE"] = "1"
if cfg.fsdp: if cfg.fsdp:
setup_fsdp_envs(cfg) setup_fsdp_envs(cfg)
elif cfg.deepspeed: elif cfg.deepspeed:

View File

@@ -47,9 +47,9 @@ def download_smollm2_135m_model():
@pytest.fixture(scope="session", autouse=True) @pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_instruct_model(): def download_llama_68m_random_model():
# download the model # download the model
snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct") snapshot_download_w_retry("JackFram/llama-68m")
@pytest.fixture(scope="session", autouse=True) @pytest.fixture(scope="session", autouse=True)

View File

@@ -28,7 +28,7 @@ class Test4dMultipackLlama(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"flash_attention": False, "flash_attention": False,
"sdp_attention": True, "sdp_attention": True,
"sample_packing": True, "sample_packing": True,
@@ -72,7 +72,7 @@ class Test4dMultipackLlama(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"flash_attention": False, "flash_attention": False,
"sdp_attention": False, "sdp_attention": False,
"sample_packing": True, "sample_packing": True,

View File

@@ -32,7 +32,7 @@ class TestFusedLlama(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"flash_attention": True, "flash_attention": True,
"pad_to_sequence_len": True, "pad_to_sequence_len": True,
"flash_attn_fuse_qkv": True, "flash_attn_fuse_qkv": True,

View File

@@ -31,7 +31,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 16384, "sequence_len": 16384,
"sample_packing": False, "sample_packing": False,
"flash_attention": True, "flash_attention": True,
@@ -76,7 +77,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 16384, "sequence_len": 16384,
"sample_packing": False, "sample_packing": False,
"flash_attention": True, "flash_attention": True,

View File

@@ -31,7 +31,8 @@ class TestLoraLlama(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"sample_packing": True, "sample_packing": True,
"flash_attention": True, "flash_attention": True,
@@ -42,7 +43,6 @@ class TestLoraLlama(unittest.TestCase):
"lora_dropout": 0.05, "lora_dropout": 0.05,
"lora_target_linear": True, "lora_target_linear": True,
"val_set_size": 0.2, "val_set_size": 0.2,
"lora_modules_to_save": ["lm_head", "embed_tokens"],
"special_tokens": { "special_tokens": {
"unk_token": "<unk>", "unk_token": "<unk>",
"bos_token": "<s>", "bos_token": "<s>",

View File

@@ -31,7 +31,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -76,7 +77,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -122,7 +124,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -169,7 +172,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -214,7 +218,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -259,7 +264,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -308,7 +314,8 @@ class TestDPOLlamaLora(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",

View File

@@ -26,7 +26,8 @@ class TestLlama:
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"trust_remote_code": True, "trust_remote_code": True,
"sequence_len": 512, "sequence_len": 512,
"val_set_size": 0.1, "val_set_size": 0.1,

View File

@@ -26,8 +26,9 @@ class TestLoadModelUtils:
# load config # load config
self.cfg = DictDefault( self.cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_config": "HuggingFaceTB/SmolLM2-135M-Instruct", "tokenizer_type": "LlamaTokenizer",
"tokenizer_config": "JackFram/llama-68m",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": False, "load_in_8bit": False,
"adapter": "lora", "adapter": "lora",

View File

@@ -28,7 +28,8 @@ class TestLoraLlama(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -36,7 +37,6 @@ class TestLoraLlama(unittest.TestCase):
"lora_alpha": 16, "lora_alpha": 16,
"lora_dropout": 0.05, "lora_dropout": 0.05,
"lora_target_linear": True, "lora_target_linear": True,
"lora_modules_to_save": ["lm_head", "embed_tokens"],
"val_set_size": 0.1, "val_set_size": 0.1,
"special_tokens": { "special_tokens": {
"unk_token": "<unk>", "unk_token": "<unk>",

View File

@@ -28,7 +28,8 @@ class TestCustomOptimizers(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",
@@ -73,7 +74,8 @@ class TestCustomOptimizers(unittest.TestCase):
# pylint: disable=duplicate-code # pylint: disable=duplicate-code
cfg = DictDefault( cfg = DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024, "sequence_len": 1024,
"load_in_8bit": True, "load_in_8bit": True,
"adapter": "lora", "adapter": "lora",

View File

@@ -16,8 +16,9 @@ class NormalizeConfigTestCase(unittest.TestCase):
def _get_base_cfg(self): def _get_base_cfg(self):
return DictDefault( return DictDefault(
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"base_model_config": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model_config": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"num_epochs": 1, "num_epochs": 1,
"micro_batch_size": 1, "micro_batch_size": 1,
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,

View File

@@ -18,8 +18,9 @@ class TestModelsUtils:
# load config # load config
self.cfg = DictDefault( # pylint: disable=attribute-defined-outside-init self.cfg = DictDefault( # pylint: disable=attribute-defined-outside-init
{ {
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct", "base_model": "JackFram/llama-68m",
"model_type": "LlamaForCausalLM", "model_type": "LlamaForCausalLM",
"tokenizer_type": "LlamaTokenizer",
"load_in_8bit": True, "load_in_8bit": True,
"load_in_4bit": False, "load_in_4bit": False,
"adapter": "lora", "adapter": "lora",