diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index bb0b35d98..d755b6326 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -19,22 +19,12 @@ jobs: cuda_version: 11.8.0 python_version: "3.9" pytorch: 2.0.1 - axolotl_extras: + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX" - cuda: "118" cuda_version: 11.8.0 python_version: "3.10" pytorch: 2.0.1 - axolotl_extras: - - cuda: "117" - cuda_version: 11.7.1 - python_version: "3.9" - pytorch: 1.13.1 - axolotl_extras: - - cuda: "118" - cuda_version: 11.8.0 - python_version: "3.9" - pytorch: 2.0.1 - axolotl_extras: gptq + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX" steps: - name: Checkout uses: actions/checkout@v3 @@ -63,4 +53,4 @@ jobs: CUDA=${{ matrix.cuda }} PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch }} - AXOLOTL_EXTRAS=${{ matrix.axolotl_extras }} + TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 259896f26..9446e03a1 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -29,11 +29,6 @@ jobs: python_version: "3.9" pytorch: 2.0.1 axolotl_extras: gptq - - cuda: cu117 - cuda_version: 11.7.1 - python_version: "3.9" - pytorch: 1.13.1 - axolotl_extras: runs-on: self-hosted steps: - name: Checkout @@ -55,7 +50,7 @@ jobs: with: context: . build-args: | - BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} + BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }} file: ./docker/Dockerfile push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} @@ -82,11 +77,6 @@ jobs: python_version: "3.9" pytorch: 2.0.1 axolotl_extras: gptq - - cuda: 117 - cuda_version: 11.7.1 - python_version: "3.9" - pytorch: 1.13.1 - axolotl_extras: runs-on: self-hosted steps: - name: Checkout diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 6862569b9..aec727c1a 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -8,7 +8,7 @@ FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION a ENV PATH="/root/miniconda3/bin:${PATH}" ARG PYTHON_VERSION="3.9" -ARG PYTORCH="2.0.0" +ARG PYTORCH_VERSION="2.0.1" ARG CUDA="118" ENV PYTHON_VERSION=$PYTHON_VERSION @@ -29,18 +29,18 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install packaging && \ - python3 -m pip install --no-cache-dir -U torch==${PYTORCH} torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu$CUDA + python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA FROM base-builder AS flash-attn-builder WORKDIR /workspace -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" RUN git clone https://github.com/Dao-AILab/flash-attention.git && \ cd flash-attention && \ - git checkout 9ee0ff1 && \ + git checkout v2.0.1 && \ python3 setup.py bdist_wheel && \ cd csrc/fused_dense_lib && \ python3 setup.py bdist_wheel && \ @@ -53,7 +53,7 @@ RUN git clone https://github.com/Dao-AILab/flash-attention.git && \ FROM base-builder AS deepspeed-builder -ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" WORKDIR /workspace @@ -74,6 +74,9 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \ FROM base-builder +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST + # recompile apex RUN python3 -m pip uninstall -y apex RUN git clone https://github.com/NVIDIA/apex