Compare commits
10 Commits
diffusion-
...
tp_support
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f68aedd1f8 | ||
|
|
3dd5c6f8ec | ||
|
|
4caa59a087 | ||
|
|
984be14147 | ||
|
|
64adbf1a15 | ||
|
|
438b623031 | ||
|
|
a74efcecbe | ||
|
|
d663652216 | ||
|
|
dbd43aa18f | ||
|
|
dbdf97e828 |
@@ -78,6 +78,9 @@ tf32: true # require >=ampere
|
||||
bfloat16: true # require >=ampere
|
||||
float16: true
|
||||
|
||||
# Use Tensor parallel
|
||||
tensor_parallel: true # require multi-GPU
|
||||
|
||||
# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
|
||||
gpu_memory_limit: 20GiB
|
||||
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
|
||||
|
||||
@@ -703,6 +703,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
"accelerator_config"
|
||||
] = self.cfg.accelerator_config
|
||||
|
||||
if self.cfg.tensor_parallel:
|
||||
training_arguments_kwargs["tp_size"] = torch.cuda.device_count()
|
||||
|
||||
if self.cfg.kd_ce_alpha is not None:
|
||||
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
|
||||
if self.cfg.kd_alpha is not None:
|
||||
|
||||
@@ -748,6 +748,8 @@ class AxolotlInputConfig(
|
||||
local_rank: Optional[int] = None
|
||||
ddp: Optional[bool] = None
|
||||
|
||||
tensor_parallel: Optional[bool] = None
|
||||
|
||||
seed: Optional[int] = None
|
||||
ddp_timeout: Optional[int] = None
|
||||
ddp_bucket_cap_mb: Optional[int] = None
|
||||
@@ -1371,6 +1373,13 @@ class AxolotlInputConfig(
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_fsdp_tp(cls, data):
|
||||
if data.get("fsdp") and data.get("tensor_parallel"):
|
||||
raise ValueError("FSDP with tensor parallelism is not supported yet.")
|
||||
return data
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check_fft_possible_bad_config(self):
|
||||
if (
|
||||
|
||||
@@ -762,6 +762,9 @@ class ModelLoader:
|
||||
return hf_ds_cfg
|
||||
|
||||
skip_move_to_device = False
|
||||
if self.cfg.tensor_parallel:
|
||||
del self.model_kwargs["device_map"]
|
||||
|
||||
if ( # pylint: disable=condition-evals-to-constant)
|
||||
(self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
|
||||
and not qlora_fsdp
|
||||
|
||||
@@ -547,6 +547,7 @@ def prepare_optim_env(cfg):
|
||||
if not check_cuda_p2p_ib_support():
|
||||
if os.getenv("NCCL_P2P_DISABLE") is None:
|
||||
os.environ["NCCL_P2P_DISABLE"] = "1"
|
||||
|
||||
if cfg.fsdp:
|
||||
setup_fsdp_envs(cfg)
|
||||
elif cfg.deepspeed:
|
||||
|
||||
Reference in New Issue
Block a user