From aae4337f4066be8c74319c337858ee8f4e4334a5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 21 Mar 2025 10:17:25 -0400 Subject: [PATCH] add 12.8.1 cuda to the base matrix (#2426) * add 12.8.1 cuda to the base matrix * use nightly * bump deepspeed and set no binary * deepspeed binary fixes hopefully * install deepspeed by itself * multiline fix * make sure ninja is installed * try with reversion of packaging/setuptools/wheel install * use license instead of license-file * try rolling back packaging and setuptools versions * comment out license for validation for now * make sure packaging version is consistent * more parity across tests and docker images for packaging/setuptools --- .github/workflows/base.yml | 8 +++++- .github/workflows/pypi.yml | 2 +- .github/workflows/tests-nightly.yml | 4 +-- .github/workflows/tests.yml | 4 +-- README.md | 2 +- cicd/Dockerfile.jinja | 2 +- docker/Dockerfile-base | 2 +- docker/Dockerfile-base-nightly | 39 +++++++++++++++++++++++++++++ pyproject.toml | 4 +-- requirements.txt | 6 ++--- setup.py | 2 +- 11 files changed, 60 insertions(+), 15 deletions(-) create mode 100644 docker/Dockerfile-base-nightly diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index e45436e93..cf5c1d45d 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -40,6 +40,12 @@ jobs: python_version: "3.11" pytorch: 2.6.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: nightly + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" steps: - name: Checkout uses: actions/checkout@v4 @@ -61,7 +67,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . - file: ./docker/Dockerfile-base + file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || './docker/Dockerfile-base' }} push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index c64afc0c9..24e3c497d 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -40,7 +40,7 @@ jobs: - name: Install dependencies run: | - pip3 install wheel packaging + pip3 install wheel packaging==23.2 pip3 install --no-build-isolation -e . pip3 install -r requirements-dev.txt -r requirements-tests.txt diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 2733b8605..efad7cc37 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -42,7 +42,7 @@ jobs: - name: upgrade pip run: | pip3 install --upgrade pip - pip3 install --upgrade packaging setuptools wheel + pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel - name: Install PyTorch run: | @@ -59,7 +59,7 @@ jobs: - name: Install dependencies run: | pip3 install --upgrade pip - pip3 install --upgrade packaging + pip3 install --upgrade packaging==23.2 pip3 install --no-build-isolation -U -e . python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 889339005..32bb42821 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -74,7 +74,7 @@ jobs: - name: upgrade pip run: | pip3 install --upgrade pip - pip3 install --upgrade packaging setuptools wheel + pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel - name: Install PyTorch run: | @@ -147,7 +147,7 @@ jobs: - name: upgrade pip run: | pip3 install --upgrade pip - pip3 install --upgrade packaging setuptools setuptools_scm build wheel + pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel - name: Install PyTorch run: | diff --git a/README.md b/README.md index 953bc0be5..343816aff 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ Features: ### Installation ```bash -pip3 install -U packaging setuptools wheel ninja +pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation axolotl[flash-attn,deepspeed] # Download example axolotl configs, deepspeed configs diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 0fb9e90ba..b212a0065 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \ sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \ fi -RUN pip3 install -U packaging setuptools wheel +RUN pip install packaging==23.2 setuptools==75.8.0 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ pip install --no-build-isolation -e .[deepspeed,flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 0146fd9fe..e989152ec 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -28,7 +28,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace -RUN python3 -m pip install --upgrade pip && pip3 install -U packaging setuptools wheel && \ +RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" diff --git a/docker/Dockerfile-base-nightly b/docker/Dockerfile-base-nightly new file mode 100644 index 000000000..85805ea41 --- /dev/null +++ b/docker/Dockerfile-base-nightly @@ -0,0 +1,39 @@ +ARG CUDA_VERSION="12.8.1" +ARG CUDNN_VERSION="8" +ARG UBUNTU_VERSION="22.04" +ARG MAX_JOBS=4 + +FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder + +ENV PATH="/root/miniconda3/bin:${PATH}" + +ARG PYTHON_VERSION="3.11" +ARG PYTORCH_VERSION="nightly" +ARG CUDA="128" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" + +ENV PYTHON_VERSION=$PYTHON_VERSION +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST + +RUN apt-get update \ + && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \ + && wget \ + https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \ + && mkdir /root/.conda \ + && bash Miniconda3-latest-Linux-x86_64.sh -b \ + && rm -f Miniconda3-latest-Linux-x86_64.sh \ + && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}" + +ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" + +WORKDIR /workspace + +RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ + python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \ + python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" + +RUN git lfs install --skip-repo && \ + pip3 install awscli && \ + # The base image ships with `pydantic==1.8.2` which is not working + pip3 install -U --no-cache-dir pydantic==1.10.10 diff --git a/pyproject.toml b/pyproject.toml index e3c6cae69..eb85691dd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging>=24.2"] +requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"] build-backend = "setuptools.build_meta" [project] @@ -8,7 +8,7 @@ dynamic = ["version", "dependencies", "optional-dependencies"] description = "LLM Trainer" readme = "README.md" requires-python = ">=3.10" -license-files = ["LICENSE"] +# license = "Apache-2.0" [project.scripts] axolotl = "axolotl.cli.main:main" diff --git a/requirements.txt b/requirements.txt index ebefc7ad4..495f43af6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # START section of dependencies that don't install on Darwin/MacOS -bitsandbytes==0.45.2 +bitsandbytes==0.45.3 triton>=3.0.0 mamba-ssm==1.2.0.post1 flash-attn==2.7.4.post1 @@ -10,14 +10,14 @@ autoawq==0.2.7.post3 liger-kernel==0.5.3 # END section -packaging==24.2 +packaging==23.2 peft==0.15.0 transformers==4.49.0 tokenizers>=0.21.1 accelerate==1.5.2 datasets==3.4.1 -deepspeed==0.16.1 +deepspeed==0.16.4 trl==0.15.1 optimum==1.16.2 diff --git a/setup.py b/setup.py index b84c34525..c4ffcdaeb 100644 --- a/setup.py +++ b/setup.py @@ -128,7 +128,7 @@ setup( "flash-attn==2.7.4.post1", ], "deepspeed": [ - "deepspeed==0.16.1", + "deepspeed==0.16.4", "deepspeed-kernels", ], "mamba-ssm": [