From 70ca1b22915188b0ae0a7ac986b97671be6c379e Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 3 Jul 2025 12:21:39 -0400 Subject: [PATCH] fix nightlies to use correct cache (#2848) [skip ci] * fix nightlies to use correct cache * fix for handling None for bf16 --- .github/workflows/tests-nightly.yml | 115 ++-------------------------- src/axolotl/core/builders/base.py | 4 +- 2 files changed, 10 insertions(+), 109 deletions(-) diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 539f7f71b..e6a1d8e58 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -18,96 +18,9 @@ jobs: env: SKIP: no-commit-to-branch - preload-cache: - name: Preload HF cache - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - python_version: ["3.11"] - pytorch_version: ["2.6.0"] - timeout-minutes: 20 - - env: - AXOLOTL_IS_CI_CACHE_PRELOAD: "1" - - steps: - - name: Check out repository code - uses: actions/checkout@v4 - - - name: Restore HF cache - id: hf-cache-restore - uses: actions/cache/restore@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ runner.os }}-hf-hub-cache-v2 - - - name: Setup Python - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python_version }} - cache: 'pip' # caching pip dependencies - - - name: upgrade pip - run: | - pip3 install --upgrade pip - pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel - - - name: Install PyTorch - run: | - pip3 install torch==${{ matrix.pytorch_version }} - - - name: Install dependencies - run: | - pip3 show torch - pip3 install --no-build-isolation -U -e . - python scripts/unsloth_install.py | sh - python scripts/cutcrossentropy_install.py | sh - pip3 install -r requirements-dev.txt -r requirements-tests.txt - - - name: Make sure PyTorch version wasn't clobbered - run: | - python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__" - - - name: Ensure axolotl CLI was installed - run: | - axolotl --help - - - name: Pre-Download dataset fixture - run: | - huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures - - - name: Run tests - run: | - pytest -v tests/conftest.py - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - flags: unittests,pytorch-${{ matrix.pytorch_version }} - fail_ci_if_error: false - - - name: cleanup pip cache - run: | - find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; - - - name: Save HF cache - id: hf-cache - uses: actions/cache/save@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }} - pytest: name: PyTest runs-on: ubuntu-latest - needs: [preload-cache] strategy: fail-fast: false max-parallel: 2 @@ -120,14 +33,11 @@ jobs: - name: Check out repository code uses: actions/checkout@v4 - - name: Restore HF cache - id: hf-cache-restore - uses: actions/cache/restore@v4 - with: - path: | - /home/runner/.cache/huggingface/hub/datasets--* - /home/runner/.cache/huggingface/hub/models--* - key: ${{ runner.os }}-hf-hub-cache-v2 + - name: Restore Cache from S3 + id: hf-cache-restore-s3 + run: | + mkdir -p /home/runner/.cache/huggingface/hub + curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd - name: Setup Python uses: actions/setup-python@v5 @@ -168,10 +78,6 @@ jobs: run: | axolotl --help - - name: Pre-Download dataset fixture - run: | - huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures - - name: Run tests run: | pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ @@ -193,15 +99,8 @@ jobs: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 - python_version: "3.11" - pytorch: 2.5.1 - num_gpus: 1 - axolotl_extras: - nightly_build: "true" - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.6.0 num_gpus: 1 diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index eed43e542..7d278db66 100644 --- a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC): if self.cfg.bf16 == "full": training_args_kwargs["bf16_full_eval"] = True else: - training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16 + bf16 = self.cfg.bf16 or self.cfg.bfloat16 + bf16 = bf16 if bf16 is not None else False + training_args_kwargs["bf16"] = bf16 def _configure_scheduler(self, training_args_kwargs: dict): if self.cfg.lr_scheduler in ["one_cycle", "rex"]: