Compare commits

...

7 Commits

Author SHA1 Message Date
Wing Lian
3afc91fba9 run 2.5.1 test without waiting for 1st e2e 2024-12-07 17:25:16 -05:00
Wing Lian
0689419d25 use pr base tag 2024-12-07 17:25:16 -05:00
Wing Lian
e64c32c0bd push test build 2024-12-07 17:25:16 -05:00
Wing Lian
ec819dde3b attempt to build the test images 2024-12-07 17:25:16 -05:00
Wing Lian
fdf4bb5087 fix default base image 2024-12-07 17:25:16 -05:00
Wing Lian
f67d16268c try with default tag 2024-12-07 17:25:16 -05:00
Wing Lian
684b543aa1 experiment with nvcr pytorch image for torch 2.5.1 2024-12-07 17:25:16 -05:00
5 changed files with 86 additions and 76 deletions

View File

@@ -22,36 +22,38 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
include: include:
- cuda: "121" # - cuda: "121"
cuda_version: 12.1.1 # cuda_version: 12.1.1
cudnn_version: 8 # cudnn_version: 8
python_version: "3.10" # python_version: "3.10"
pytorch: 2.3.1 # pytorch: 2.3.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121" # from_base_img: ""
cuda_version: 12.1.1 # from_base_tag: ""
cudnn_version: 8 # - cuda: "121"
python_version: "3.11" # cuda_version: 12.1.1
pytorch: 2.3.1 # cudnn_version: 8
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # python_version: "3.11"
- cuda: "124" # pytorch: 2.3.1
cuda_version: 12.4.1 # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
cudnn_version: "" # from_base_img: ""
python_version: "3.10" # from_base_tag: ""
pytorch: 2.4.1 # - cuda: "124"
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # cuda_version: 12.4.1
- cuda: "124" # cudnn_version: ""
cuda_version: 12.4.1 # python_version: "3.11"
cudnn_version: "" # pytorch: 2.4.1
python_version: "3.11" # torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
pytorch: 2.4.1 # from_base_img: ""
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" # from_base_tag: ""
- cuda: "124" - cuda: "124"
cuda_version: 12.4.1 cuda_version: 12.4.1
cudnn_version: "" cudnn_version: ""
python_version: "3.11" python_version: "3.11"
pytorch: 2.5.1 pytorch: 2.5.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
from_base_img: nvcr.io/nvidia/pytorch
from_base_tag: 24.10-py3
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -61,7 +63,7 @@ jobs:
with: with:
images: | images: |
winglian/axolotl-base winglian/axolotl-base
axolotlai/axolotl-base # axolotlai/axolotl-base
- name: Login to Docker Hub - name: Login to Docker Hub
uses: docker/login-action@v2 uses: docker/login-action@v2
with: with:
@@ -74,7 +76,8 @@ jobs:
with: with:
context: . context: .
file: ./docker/Dockerfile-base file: ./docker/Dockerfile-base
push: ${{ github.event_name != 'pull_request' }} push: true
# push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
build-args: | build-args: |
@@ -84,3 +87,5 @@ jobs:
PYTHON_VERSION=${{ matrix.python_version }} PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch }} PYTORCH_VERSION=${{ matrix.pytorch }}
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
BASE_IMAGE=${{ matrix.from_base_img || '' }}
BASE_TAG=${{ matrix.from_base_tag || '' }}

View File

@@ -148,63 +148,64 @@ jobs:
run: | run: |
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
docker-e2e-tests-1st: # docker-e2e-tests-1st:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }} # if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners... # # this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal] # runs-on: [self-hosted, modal]
timeout-minutes: 90 # timeout-minutes: 90
needs: [pre-commit, pytest, pytest-sdist] # needs: [pre-commit, pytest, pytest-sdist]
#
strategy: # strategy:
fail-fast: false # fail-fast: false
matrix: # matrix:
include: # include:
- cuda: 124 # - cuda: 124
cuda_version: 12.4.1 # cuda_version: 12.4.1
python_version: "3.11" # python_version: "3.11"
pytorch: 2.4.1 # pytorch: 2.4.1
num_gpus: 1 # num_gpus: 1
axolotl_extras: # axolotl_extras:
steps: # steps:
- name: Checkout # - name: Checkout
uses: actions/checkout@v4 # uses: actions/checkout@v4
- name: Install Python # - name: Install Python
uses: actions/setup-python@v5 # uses: actions/setup-python@v5
with: # with:
python-version: "3.10" # python-version: "3.10"
- name: Install Modal # - name: Install Modal
run: | # run: |
python -m pip install --upgrade pip # python -m pip install --upgrade pip
pip install modal==0.63.64 jinja2 # pip install modal==0.63.64 jinja2
- name: Update env vars # - name: Update env vars
run: | # run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV # echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV # echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV # echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV # echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV # echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV # echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
- name: Run tests job on Modal # - name: Run tests job on Modal
run: | # run: |
modal run cicd.tests # modal run cicd.tests
docker-e2e-tests: docker-e2e-tests:
if: github.repository_owner == 'axolotl-ai-cloud' if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners... # this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal] runs-on: [self-hosted, modal]
timeout-minutes: 90 timeout-minutes: 90
needs: [pre-commit, pytest, docker-e2e-tests-1st] # needs: [pre-commit, pytest, docker-e2e-tests-1st]
needs: [pre-commit, pytest]
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
include: include:
- cuda: 121 # - cuda: 121
cuda_version: 12.1.1 # cuda_version: 12.1.1
python_version: "3.10" # python_version: "3.10"
pytorch: 2.3.1 # pytorch: 2.3.1
num_gpus: 1 # num_gpus: 1
axolotl_extras: mamba-ssm # axolotl_extras: mamba-ssm
- cuda: 124 - cuda: 124
cuda_version: 12.4.1 cuda_version: 12.4.1
python_version: "3.11" python_version: "3.11"
@@ -224,7 +225,7 @@ jobs:
pip install modal==0.63.64 jinja2 pip install modal==0.63.64 jinja2
- name: Update env vars - name: Update env vars
run: | run: |
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV echo "BASE_TAG=pr-2139-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV

View File

@@ -1,4 +1,4 @@
FROM axolotlai/axolotl-base:{{ BASE_TAG }} FROM winglian/axolotl-base:{{ BASE_TAG }}
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}" ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"

View File

@@ -1,5 +1,6 @@
ARG BASE_IMAGE=axolotlai/axolotl-base
ARG BASE_TAG=main-base ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base:$BASE_TAG FROM $BASE_IMAGE:$BASE_TAG
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS="" ARG AXOLOTL_EXTRAS=""

View File

@@ -3,7 +3,10 @@ ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04" ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4 ARG MAX_JOBS=4
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder ARG BASE_IMAGE=nvidia/cuda
ARG DEFAULT_TAG=${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_TAG=""
FROM ${BASE_IMAGE:-nvidia/cuda}:${BASE_TAG:-${DEFAULT_TAG}} AS base-builder
ENV PATH="/root/miniconda3/bin:${PATH}" ENV PATH="/root/miniconda3/bin:${PATH}"