# Patch summary
#   * .github/workflows/main.yml: append a matrix entry (cuda 128, CUDA 12.8.1,
#     Python 3.11, PyTorch 2.7.0) to two matrices.
#   * .github/workflows/tests.yml: append the same entry (plus num_gpus: 1) to a
#     test matrix, and add comments explaining that docker-e2e-tests-1st gates
#     the rest of the e2e matrix.
# NOTE(review): this patch was recovered from a copy whose line breaks and
# indentation had been collapsed. Line breaks follow the hunk headers; the YAML
# indentation assumes the usual 2-space GitHub Actions layout -- TODO confirm
# against the target files (e.g. `git apply --check`) before applying.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 4fcf08352..01606f902 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -31,6 +31,11 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.0
             axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.0
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -94,6 +99,11 @@ jobs:
             python_version: "3.11"
             pytorch: 2.7.0
             axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.0
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c296e2314..69f0a030d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -295,6 +295,7 @@ jobs:
           find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
 
   docker-e2e-tests-1st:
+    # Run this job first as a gate for running the remainder of the test matrix
     if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
@@ -341,6 +342,8 @@ jobs:
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 90
+    # Only run the remainder of the matrix if the first e2e check passed;
+    # this is to save on wasted compute costs for known failures that get caught in the first run
     needs: [pre-commit, pytest, docker-e2e-tests-1st]
 
     strategy:
@@ -365,6 +368,12 @@ jobs:
             pytorch: 2.7.0
             num_gpus: 1
             axolotl_extras:
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.7.0
+            num_gpus: 1
+            axolotl_extras:
     steps:
       - name: Checkout
         uses: actions/checkout@v4