add additional packages via apt for better multi-node support (#2949)

* cleanup in Dockerfile and add infiniband packages

* fixes for CI

* fix nightly too
This commit is contained in:
Wing Lian
2025-07-20 21:19:23 -04:00
committed by GitHub
parent b986f7c7cb
commit 31a15a49b6
6 changed files with 26 additions and 20 deletions

View File

@@ -14,7 +14,8 @@ COPY scripts/motd /etc/motd
# Install the Jupyter tooling (JupyterLab, classic Notebook, ipywidgets).
# --no-cache-dir keeps pip's download cache out of this image layer.
# `jupyter lab clean` stays in the same RUN so whatever it removes is never
# committed to the layer.
RUN pip install --no-cache-dir jupyterlab notebook ipywidgets && \
    jupyter lab clean
RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
rm -rf /var/lib/apt/lists/* && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \