Compare commits
10 Commits
fix/replac
...
tp_support
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f68aedd1f8 | ||
|
|
3dd5c6f8ec | ||
|
|
4caa59a087 | ||
|
|
984be14147 | ||
|
|
64adbf1a15 | ||
|
|
438b623031 | ||
|
|
a74efcecbe | ||
|
|
d663652216 | ||
|
|
dbd43aa18f | ||
|
|
dbdf97e828 |
@@ -78,6 +78,9 @@ tf32: true # require >=ampere
|
|||||||
bfloat16: true # require >=ampere
|
bfloat16: true # require >=ampere
|
||||||
float16: true
|
float16: true
|
||||||
|
|
||||||
|
# Use tensor parallelism
|
||||||
|
tensor_parallel: true # require multi-GPU
|
||||||
|
|
||||||
# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
|
# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
|
||||||
gpu_memory_limit: 20GiB
|
gpu_memory_limit: 20GiB
|
||||||
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
|
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
|
||||||
|
|||||||
@@ -703,6 +703,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
"accelerator_config"
|
"accelerator_config"
|
||||||
] = self.cfg.accelerator_config
|
] = self.cfg.accelerator_config
|
||||||
|
|
||||||
|
if self.cfg.tensor_parallel:
|
||||||
|
training_arguments_kwargs["tp_size"] = torch.cuda.device_count()
|
||||||
|
|
||||||
if self.cfg.kd_ce_alpha is not None:
|
if self.cfg.kd_ce_alpha is not None:
|
||||||
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
|
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
|
||||||
if self.cfg.kd_alpha is not None:
|
if self.cfg.kd_alpha is not None:
|
||||||
|
|||||||
@@ -748,6 +748,8 @@ class AxolotlInputConfig(
|
|||||||
local_rank: Optional[int] = None
|
local_rank: Optional[int] = None
|
||||||
ddp: Optional[bool] = None
|
ddp: Optional[bool] = None
|
||||||
|
|
||||||
|
tensor_parallel: Optional[bool] = None
|
||||||
|
|
||||||
seed: Optional[int] = None
|
seed: Optional[int] = None
|
||||||
ddp_timeout: Optional[int] = None
|
ddp_timeout: Optional[int] = None
|
||||||
ddp_bucket_cap_mb: Optional[int] = None
|
ddp_bucket_cap_mb: Optional[int] = None
|
||||||
@@ -1371,6 +1373,13 @@ class AxolotlInputConfig(
|
|||||||
)
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@model_validator(mode="before")
|
||||||
|
@classmethod
|
||||||
|
def check_fsdp_tp(cls, data):
|
||||||
|
if data.get("fsdp") and data.get("tensor_parallel"):
|
||||||
|
raise ValueError("FSDP with tensor parallelism is not supported yet.")
|
||||||
|
return data
|
||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
def check_fft_possible_bad_config(self):
|
def check_fft_possible_bad_config(self):
|
||||||
if (
|
if (
|
||||||
|
|||||||
@@ -762,6 +762,9 @@ class ModelLoader:
|
|||||||
return hf_ds_cfg
|
return hf_ds_cfg
|
||||||
|
|
||||||
skip_move_to_device = False
|
skip_move_to_device = False
|
||||||
|
if self.cfg.tensor_parallel:
|
||||||
|
del self.model_kwargs["device_map"]
|
||||||
|
|
||||||
if ( # pylint: disable=condition-evals-to-constant)
|
if ( # pylint: disable=condition-evals-to-constant)
|
||||||
(self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
|
(self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
|
||||||
and not qlora_fsdp
|
and not qlora_fsdp
|
||||||
|
|||||||
@@ -547,6 +547,7 @@ def prepare_optim_env(cfg):
|
|||||||
if not check_cuda_p2p_ib_support():
|
if not check_cuda_p2p_ib_support():
|
||||||
if os.getenv("NCCL_P2P_DISABLE") is None:
|
if os.getenv("NCCL_P2P_DISABLE") is None:
|
||||||
os.environ["NCCL_P2P_DISABLE"] = "1"
|
os.environ["NCCL_P2P_DISABLE"] = "1"
|
||||||
|
|
||||||
if cfg.fsdp:
|
if cfg.fsdp:
|
||||||
setup_fsdp_envs(cfg)
|
setup_fsdp_envs(cfg)
|
||||||
elif cfg.deepspeed:
|
elif cfg.deepspeed:
|
||||||
|
|||||||
@@ -47,9 +47,9 @@ def download_smollm2_135m_model():
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session", autouse=True)
|
@pytest.fixture(scope="session", autouse=True)
|
||||||
def download_smollm2_135m_instruct_model():
|
def download_llama_68m_random_model():
|
||||||
# download the model
|
# download the model
|
||||||
snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct")
|
snapshot_download_w_retry("JackFram/llama-68m")
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session", autouse=True)
|
@pytest.fixture(scope="session", autouse=True)
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"flash_attention": False,
|
"flash_attention": False,
|
||||||
"sdp_attention": True,
|
"sdp_attention": True,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
@@ -72,7 +72,7 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"flash_attention": False,
|
"flash_attention": False,
|
||||||
"sdp_attention": False,
|
"sdp_attention": False,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class TestFusedLlama(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"pad_to_sequence_len": True,
|
"pad_to_sequence_len": True,
|
||||||
"flash_attn_fuse_qkv": True,
|
"flash_attn_fuse_qkv": True,
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 16384,
|
"sequence_len": 16384,
|
||||||
"sample_packing": False,
|
"sample_packing": False,
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
@@ -76,7 +77,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 16384,
|
"sequence_len": 16384,
|
||||||
"sample_packing": False,
|
"sample_packing": False,
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ class TestLoraLlama(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
@@ -42,7 +43,6 @@ class TestLoraLlama(unittest.TestCase):
|
|||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"val_set_size": 0.2,
|
"val_set_size": 0.2,
|
||||||
"lora_modules_to_save": ["lm_head", "embed_tokens"],
|
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
"bos_token": "<s>",
|
"bos_token": "<s>",
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -76,7 +77,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -122,7 +124,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -169,7 +172,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -214,7 +218,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -259,7 +264,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -308,7 +314,8 @@ class TestDPOLlamaLora(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
|
|||||||
@@ -26,7 +26,8 @@ class TestLlama:
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"trust_remote_code": True,
|
"trust_remote_code": True,
|
||||||
"sequence_len": 512,
|
"sequence_len": 512,
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.1,
|
||||||
|
|||||||
@@ -26,8 +26,9 @@ class TestLoadModelUtils:
|
|||||||
# load config
|
# load config
|
||||||
self.cfg = DictDefault(
|
self.cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"tokenizer_config": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
|
"tokenizer_config": "JackFram/llama-68m",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": False,
|
"load_in_8bit": False,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ class TestLoraLlama(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -36,7 +37,6 @@ class TestLoraLlama(unittest.TestCase):
|
|||||||
"lora_alpha": 16,
|
"lora_alpha": 16,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
"lora_target_linear": True,
|
"lora_target_linear": True,
|
||||||
"lora_modules_to_save": ["lm_head", "embed_tokens"],
|
|
||||||
"val_set_size": 0.1,
|
"val_set_size": 0.1,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"unk_token": "<unk>",
|
"unk_token": "<unk>",
|
||||||
|
|||||||
@@ -28,7 +28,8 @@ class TestCustomOptimizers(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
@@ -73,7 +74,8 @@ class TestCustomOptimizers(unittest.TestCase):
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"sequence_len": 1024,
|
"sequence_len": 1024,
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
|
|||||||
@@ -16,8 +16,9 @@ class NormalizeConfigTestCase(unittest.TestCase):
|
|||||||
def _get_base_cfg(self):
|
def _get_base_cfg(self):
|
||||||
return DictDefault(
|
return DictDefault(
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"base_model_config": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model_config": "JackFram/llama-68m",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"micro_batch_size": 1,
|
"micro_batch_size": 1,
|
||||||
"gradient_accumulation_steps": 1,
|
"gradient_accumulation_steps": 1,
|
||||||
|
|||||||
@@ -18,8 +18,9 @@ class TestModelsUtils:
|
|||||||
# load config
|
# load config
|
||||||
self.cfg = DictDefault( # pylint: disable=attribute-defined-outside-init
|
self.cfg = DictDefault( # pylint: disable=attribute-defined-outside-init
|
||||||
{
|
{
|
||||||
"base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
|
"base_model": "JackFram/llama-68m",
|
||||||
"model_type": "LlamaForCausalLM",
|
"model_type": "LlamaForCausalLM",
|
||||||
|
"tokenizer_type": "LlamaTokenizer",
|
||||||
"load_in_8bit": True,
|
"load_in_8bit": True,
|
||||||
"load_in_4bit": False,
|
"load_in_4bit": False,
|
||||||
"adapter": "lora",
|
"adapter": "lora",
|
||||||
|
|||||||
Reference in New Issue
Block a user