From 31a15a49b66a19e69819af17e694126dd76974c3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 20 Jul 2025 21:19:23 -0400 Subject: [PATCH] add additional packages via apt for better multi-node support (#2949) * cleanup in Dockerfile and add infiniband packages * fixes for ci * fix nightly too --- cicd/Dockerfile-uv.jinja | 2 +- cicd/Dockerfile.jinja | 2 +- docker/Dockerfile | 21 +++++++++------------ docker/Dockerfile-base | 8 ++++++-- docker/Dockerfile-base-nightly | 10 +++++++--- docker/Dockerfile-cloud | 3 ++- 6 files changed, 26 insertions(+), 20 deletions(-) diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja index 84527274d..860386187 100644 --- a/cicd/Dockerfile-uv.jinja +++ b/cicd/Dockerfile-uv.jinja @@ -11,7 +11,7 @@ ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" ENV HF_HOME="{{ HF_HOME }}" RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm WORKDIR /workspace diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 13920de78..94c9a67e3 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -12,7 +12,7 @@ ENV HF_HOME="{{ HF_HOME }}" ENV AXOLOTL_DATASET_PROCESSES="8" RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm WORKDIR /workspace diff --git a/docker/Dockerfile b/docker/Dockerfile index e23a729d4..7114fd104 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -10,7 +10,8 @@ ARG PYTORCH_VERSION="2.1.2" ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs + apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \ + rm -rf /var/lib/apt/lists/* WORKDIR /workspace @@ -23,17 +24,13 @@ RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \ - fi - -RUN python scripts/unsloth_install.py | sh -RUN python scripts/cutcrossentropy_install.py | sh - -# So we can test the Docker image -RUN pip install pytest + fi && \ + python scripts/unsloth_install.py | sh && \ + python scripts/cutcrossentropy_install.py | sh && \ + pip install pytest && \ + pip cache purge # fix so that git fetch/pull from remote works RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ - git config --get remote.origin.fetch - -# helper for huggingface-login cli -RUN git config --global credential.helper store + git config --get remote.origin.fetch && \ + git config --global credential.helper store diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index df4240325..4c301932d 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -22,6 +22,8 @@ RUN apt-get update \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" @@ -31,12 +33,14 @@ WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ - python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ + python3 -m pip cache purge RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic==1.10.10 + pip3 install -U --no-cache-dir pydantic==1.10.10 && \ + pip3 cache purge RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \ FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \ diff --git a/docker/Dockerfile-base-nightly b/docker/Dockerfile-base-nightly index 85805ea41..cc74e6bb9 100644 --- a/docker/Dockerfile-base-nightly +++ b/docker/Dockerfile-base-nightly @@ -22,18 +22,22 @@ RUN apt-get update \ && mkdir /root/.conda \ && bash Miniconda3-latest-Linux-x86_64.sh -b \ && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \ + && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \ && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace -RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ +RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ - python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ + python3 -m pip cache purge RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working - pip3 install -U --no-cache-dir pydantic==1.10.10 + pip3 install -U --no-cache-dir pydantic==1.10.10 && \ + pip3 cache purge diff --git a/docker/Dockerfile-cloud b/docker/Dockerfile-cloud index c84ea1dca..e53bba239 100644 --- a/docker/Dockerfile-cloud +++ b/docker/Dockerfile-cloud @@ -14,7 +14,8 @@ COPY scripts/motd /etc/motd RUN pip install jupyterlab notebook ipywidgets && \ jupyter lab clean -RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \ +RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \ + rm -rf /var/lib/apt/lists/* && \ mkdir -p ~/.ssh && \ chmod 700 ~/.ssh && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \