use can_device_access_peer for P2P checks (#3209) [skip ci]
* use can_device_access_peer for P2P checks
* also log a warning when automatically setting NCCL_P2P_DISABLE=1
This commit is contained in:
@@ -3,66 +3,46 @@ utils to get GPU info for the current environment
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess # nosec B404
|
|
||||||
from importlib.metadata import version
|
from importlib.metadata import version
|
||||||
|
|
||||||
|
import torch
|
||||||
from accelerate.utils.environment import (
|
from accelerate.utils.environment import (
|
||||||
check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
|
check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
|
||||||
get_gpu_info,
|
|
||||||
)
|
)
|
||||||
from packaging.version import Version, parse
|
from packaging.version import Version, parse
|
||||||
|
|
||||||
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def check_cuda_p2p_ib_support():
|
def check_cuda_p2p_ib_support():
|
||||||
if not accelerate_check_cuda_p2p_ib_support():
|
if not accelerate_check_cuda_p2p_ib_support():
|
||||||
return False
|
return False
|
||||||
if not check_runpod_p2p_support():
|
if not check_cuda_p2p_support():
|
||||||
return False
|
return False
|
||||||
unsupported_devices = {"RTX 6000 Ada", "L40S"}
|
|
||||||
try:
|
|
||||||
device_names, device_count = get_gpu_info()
|
|
||||||
if 1 < device_count < 8:
|
|
||||||
if any(
|
|
||||||
unsupported_device in device_name
|
|
||||||
for device_name in device_names
|
|
||||||
for unsupported_device in unsupported_devices
|
|
||||||
):
|
|
||||||
return False
|
|
||||||
except Exception: # nosec B110
|
|
||||||
pass
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def check_runpod_p2p_support() -> bool:
|
def check_cuda_p2p_support() -> bool:
|
||||||
if "RUNPOD_GPU_COUNT" not in os.environ:
|
|
||||||
return True
|
|
||||||
try:
|
try:
|
||||||
gpu_count = int(os.environ.get("RUNPOD_GPU_COUNT", "1"))
|
world_size = int(os.environ.get("WORLD_SIZE", "1"))
|
||||||
|
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
|
||||||
except ValueError:
|
except ValueError:
|
||||||
return True
|
return True
|
||||||
if gpu_count >= 2:
|
|
||||||
# run `nvidia-smi topo -p2p n` and inspect the GPU0 row
|
if world_size > 1:
|
||||||
|
node_world_size = int(os.environ.get("NODE_WORLD_SIZE", "8"))
|
||||||
|
local_other_rank = (local_rank // node_world_size) * node_world_size
|
||||||
|
local_other_rank += 1 if (local_rank % node_world_size) == 0 else 0
|
||||||
try:
|
try:
|
||||||
result = subprocess.run( # nosec B603 B607
|
can_p2p = torch.cuda.can_device_access_peer(local_rank, local_other_rank)
|
||||||
["nvidia-smi", "topo", "-p2p", "n"],
|
except AssertionError as exc:
|
||||||
check=True,
|
# some sort of logic error in indexing processes, assume p2p is fine for now
|
||||||
capture_output=True,
|
LOG.warning(exc)
|
||||||
text=True,
|
|
||||||
timeout=5,
|
|
||||||
)
|
|
||||||
except (
|
|
||||||
subprocess.CalledProcessError,
|
|
||||||
FileNotFoundError,
|
|
||||||
subprocess.TimeoutExpired,
|
|
||||||
):
|
|
||||||
return True # fail-open if detection fails
|
|
||||||
output_lines = result.stdout.strip().split("\n")
|
|
||||||
# filter rows that start with "GPU0" (avoid header row)
|
|
||||||
gpu0_rows = [line for line in output_lines if line.lstrip().startswith("GPU0")]
|
|
||||||
if not gpu0_rows:
|
|
||||||
return True
|
return True
|
||||||
# consider P2P supported if any OK is present in the GPU0 row
|
return can_p2p
|
||||||
return "OK" in gpu0_rows[-1]
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -641,6 +641,7 @@ def setup_parallelism_envs(cfg):
|
|||||||
def prepare_optim_env(cfg):
|
def prepare_optim_env(cfg):
|
||||||
if not check_cuda_p2p_ib_support():
|
if not check_cuda_p2p_ib_support():
|
||||||
if os.getenv("NCCL_P2P_DISABLE") is None:
|
if os.getenv("NCCL_P2P_DISABLE") is None:
|
||||||
|
LOG.warning("P2P support not detected, setting `NCCL_P2P_DISABLE=1`")
|
||||||
os.environ["NCCL_P2P_DISABLE"] = "1"
|
os.environ["NCCL_P2P_DISABLE"] = "1"
|
||||||
# TODO @SalmanMohammadi remove the cfg.fsdp check in 0.12
|
# TODO @SalmanMohammadi remove the cfg.fsdp check in 0.12
|
||||||
if cfg.fsdp or cfg.fsdp_config:
|
if cfg.fsdp or cfg.fsdp_config:
|
||||||
|
|||||||
Reference in New Issue
Block a user