---
# Nightly CI: runs the test suite against the `main` branches of the upstream
# Hugging Face ecosystem (transformers/peft/accelerate/trl/datasets) to catch
# breakage early. Triggered daily by cron, manually, or by PRs that edit this
# workflow file itself.
name: Tests Nightly against upstream main

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"  # Runs at 00:00 UTC every day
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - ".github/workflows/tests-nightly.yml"

permissions:
  contents: read

env:
  UV_SYSTEM_PYTHON: "1"

jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"  # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

  prime-cdn-s3-cache:
    name: Prefetch S3 once to prime the CDN cache
    runs-on: ubuntu-latest
    # On non-PR triggers (schedule/dispatch) github.event.pull_request is null,
    # so this condition evaluates to true and the job still runs.
    if: ${{ !github.event.pull_request.draft }}
    timeout-minutes: 10
    steps:
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          # Ranged request just warms the CDN edge cache; --fail makes an HTTP
          # error surface as a step failure instead of silently succeeding.
          curl --fail -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null

  pytest:
    name: PyTest
    runs-on: ubuntu-latest
    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.9.1", "2.10.0"]
    timeout-minutes: 20
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          mkdir -p /home/runner/.cache/huggingface/hub
          # --fail prevents an HTTP error page from being piped into tar as if
          # it were the zstd archive.
          curl --fail -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
      - name: Install uv
        uses: astral-sh/setup-uv@v7
      - name: Install PyTorch
        run: |
          uv pip install torch==${{ matrix.pytorch_version }} torchvision
          # Pin the installed torch/torchvision so later installs can't clobber them.
          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
      - name: Install dependencies
        run: |
          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
          python scripts/cutcrossentropy_install.py --uv | sh
          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
      - name: Override with nightly HF packages
        run: |
          # --no-deps so the git installs cannot drag in a different torch.
          uv pip install --no-deps \
            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
            "peft @ git+https://github.com/huggingface/peft.git@main" \
            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
            "trl @ git+https://github.com/huggingface/trl.git@main" \
            "datasets @ git+https://github.com/huggingface/datasets.git@main"
      - name: Make sure PyTorch version wasn't clobbered
        run: |
          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help
      - name: Run tests
        run: |
          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    needs: [pre-commit, pytest]
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.10.0
            num_gpus: 1
            axolotl_extras:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests

  docker-e2e-multigpu-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    needs: [pre-commit, pytest, docker-e2e-tests]
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 2
            axolotl_extras:
            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.multigpu