From 234931d5127a6ac9162e139fc2317ef62707c53b Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Thu, 5 Mar 2026 15:04:38 -0500
Subject: [PATCH] extend pytest-sdist timeout to 30 min for slow/flaky tests (#3456) [skip ci]

* extend pytest-sdist timeout to 30 min for slow/flaky tests

* Also preload the cdn cache so it doesn't get stampeded

* fix yaml syntax

* missing fields

* can't pipe to dev/null

* Fix nightlies and add 2.10.0 to multi-gpu suite
---
Review notes (text in this section is not part of the commit message):

* This patch was re-flowed from a whitespace-collapsed copy. Hunk sizes
  were checked against the hunk headers and the diffstat, but leading
  whitespace (especially on commented-out YAML lines) is reconstructed
  and should be confirmed against commit 234931d5; if the patch does not
  apply cleanly, try `git apply --ignore-whitespace`.
* tests-nightly.yml: the new pytorch 2.10.0 matrix entry has no
  nightly_build key, unlike the entries on either side of it, so
  NIGHTLY_BUILD is exported empty for that entry. Confirm this is
  intended.
* multi-gpu-e2e.yml: the new entry sets matrix.dockerfile, but this patch
  only adds an E2E_DOCKERFILE export to tests-nightly.yml. Verify that
  multi-gpu-e2e.yml already consumes matrix.dockerfile.
* prime-cdn-s3-cache: curl is run without --fail, so an HTTP error
  response still exits 0 and the job passes without priming anything.
  Confirm that best-effort behaviour is intended.

 .github/workflows/multi-gpu-e2e.yml | 13 +++++++------
 .github/workflows/tests-nightly.yml | 26 +++++++++++++++++++++++---
 .github/workflows/tests.yml         | 16 ++++++++++++++--
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 5187a08c7..c1e5c5d75 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -35,12 +35,6 @@ jobs:
             pytorch: 2.8.0
             axolotl_extras: fbgemm-gpu
             num_gpus: 2
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
           - cuda: 129
             cuda_version: 12.9.1
             python_version: "3.12"
@@ -55,6 +49,13 @@ jobs:
             axolotl_extras:
             # axolotl_extras: fbgemm-gpu
             num_gpus: 2
+          - cuda: 128
+            cuda_version: 12.8.1
+            python_version: "3.11"
+            pytorch: 2.10.0
+            axolotl_extras: "fbgemm-gpu"
+            num_gpus: 2
+            dockerfile: "Dockerfile-uv.jinja"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:
diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index 33aa8525a..45596a2e1 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -18,15 +18,27 @@ jobs:
         env:
           SKIP: no-commit-to-branch
 
+  prime-cdn-s3-cache:
+    name: Prefetch S3 once to prime the CDN cache
+    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
+    timeout-minutes: 10
+    steps:
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
+    needs: [prime-cdn-s3-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
       matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
+        python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
+        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
     timeout-minutes: 20
 
     steps:
@@ -102,16 +114,23 @@ jobs:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
             nightly_build: "true"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
+            pytorch: 2.10.0
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 130
+            cuda_version: 13.0.0
+            python_version: "3.12"
             pytorch: 2.9.1
             num_gpus: 1
             axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
             nightly_build: "true"
     steps:
       - name: Checkout
@@ -132,6 +151,7 @@ jobs:
           echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
           echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
           echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
           echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
       - name: Run tests job on Modal
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 60610450a..abb4cba9f 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -46,11 +46,22 @@ jobs:
         env:
           SKIP: no-commit-to-branch
 
+  prime-cdn-s3-cache:
+    name: Prefetch S3 once to prime the CDN cache
+    runs-on: ubuntu-latest
+    if: ${{ !github.event.pull_request.draft }}
+    timeout-minutes: 10
+    steps:
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
     if: ${{ !github.event.pull_request.draft }}
-#    needs: [preload-cache]
+    needs: [prime-cdn-s3-cache]
     strategy:
       fail-fast: false
       matrix:
@@ -146,6 +157,7 @@ jobs:
     name: PyTest from Source Dist
     runs-on: ubuntu-latest
     if: ${{ !github.event.pull_request.draft }}
+    needs: [prime-cdn-s3-cache]
     strategy:
       fail-fast: false
       matrix:
@@ -156,7 +168,7 @@ jobs:
 #            pytorch_version: "2.8.0"
 #          - python_version: "3.14"
 #            pytorch_version: "2.9.1"
-    timeout-minutes: 20
+    timeout-minutes: 30
 
     steps:
       - name: cleanup node