repop cache (#2639)

* repop cache * pre-cache as a step * fix the name * add reason for pytest skipif * restore pytorch matrix * remove max-parallel now that we've optimized this a bit
2025-05-06 11:09:07 -04:00
parent a980618fd0
commit f720b6e72d
2 changed files with 120 additions and 50 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,12 +44,98 @@ jobs:
        env:
          SKIP: no-commit-to-branch

-  pytest:
-    name: PyTest
+  preload-cache:
+    name: Preload HF cache
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
-      max-parallel: 2
+      matrix:
+        python_version: ["3.11"]
+        pytorch_version: ["2.6.0"]
+    timeout-minutes: 20
+
+    env:
+      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
+
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+
+      - name: Restore HF cache
+        id: hf-cache-restore
+        uses: actions/cache/restore@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ runner.os }}-hf-hub-cache-v2
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python_version }}
+          cache: 'pip' # caching pip dependencies
+
+      - name: upgrade pip
+        run: |
+          pip3 install --upgrade pip
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+
+      - name: Install PyTorch
+        run: |
+          pip3 install torch==${{ matrix.pytorch_version }}
+
+      - name: Install dependencies
+        run: |
+          pip3 show torch
+          pip3 install --no-build-isolation -U -e .
+          python scripts/unsloth_install.py | sh
+          python scripts/cutcrossentropy_install.py | sh
+          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+
+      - name: Make sure PyTorch version wasn't clobbered
+        run: |
+          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
+
+      - name: Ensure axolotl CLI was installed
+        run: |
+          axolotl --help
+
+      - name: Pre-Download dataset fixture
+        run: |
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+
+      - name: Run tests
+        run: |
+          pytest -v tests/conftest.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          files: ./coverage.xml
+          flags: unittests,pytorch-${{ matrix.pytorch_version }}
+          fail_ci_if_error: false
+
+      - name: cleanup pip cache
+        run: |
+          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+
+      - name: Save HF cache
+        id: hf-cache
+        uses: actions/cache/save@v4
+        with:
+          path: |
+            /home/runner/.cache/huggingface/hub/datasets--*
+            /home/runner/.cache/huggingface/hub/models--*
+          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
+
+  pytest:
+    name: PyTest
+    runs-on: ubuntu-latest
+    needs: [preload-cache]
+    strategy:
+      fail-fast: false
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -121,21 +207,12 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
+    needs: [preload-cache]
    strategy:
      fail-fast: false
-      max-parallel: 1
      matrix:
        python_version: ["3.11"]
        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
@@ -199,15 +276,6 @@ jobs:
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
  docker-e2e-tests-1st:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,6 +4,7 @@ shared pytest fixtures

 import functools
 import importlib
+import os
 import shutil
 import sys
 import tempfile
@@ -529,31 +530,32 @@ def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(


 # # pylint: disable=redefined-outer-name,unused-argument
-# def test_load_fixtures(
-#     download_smollm2_135m_model,
-#     download_llama_68m_random_model,
-#     download_qwen_2_5_half_billion_model,
-#     download_tatsu_lab_alpaca_dataset,
-#     download_mhenrichsen_alpaca_2k_dataset,
-#     download_mhenrichsen_alpaca_2k_w_revision_dataset,
-#     download_mlabonne_finetome_100k_dataset,
-#     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
-#     download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
-#     download_fozzie_alpaca_dpo_dataset,
-#     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
-#     download_argilla_dpo_pairs_dataset,
-#     download_tiny_shakespeare_dataset,
-#     download_deepseek_model_fixture,
-#     download_huggyllama_model_fixture,
-#     download_llama_1b_model_fixture,
-#     download_llama3_8b_model_fixture,
-#     download_llama3_8b_instruct_model_fixture,
-#     download_phi_35_mini_model_fixture,
-#     download_phi_3_medium_model_fixture,
-#     download_mistral_7b_model_fixture,
-#     download_gemma_2b_model_fixture,
-#     download_gemma2_9b_model_fixture,
-#     download_mlx_mistral_7b_model_fixture,
-#     download_llama2_model_fixture,
-# ):
-#     pass
+@pytest.mark.skipif(
+    os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
+    reason="Not running in CI cache preload",
+)
+def test_load_fixtures(
+    download_smollm2_135m_model,
+    download_qwen_2_5_half_billion_model,
+    download_tatsu_lab_alpaca_dataset,
+    download_mhenrichsen_alpaca_2k_dataset,
+    download_mhenrichsen_alpaca_2k_w_revision_dataset,
+    download_mlabonne_finetome_100k_dataset,
+    download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
+    download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
+    download_argilla_dpo_pairs_dataset,
+    download_tiny_shakespeare_dataset,
+    download_deepseek_model_fixture,
+    download_huggyllama_model_fixture,
+    download_llama_1b_model_fixture,
+    download_llama3_8b_model_fixture,
+    download_llama3_8b_instruct_model_fixture,
+    download_phi_35_mini_model_fixture,
+    download_phi_3_medium_model_fixture,
+    download_mistral_7b_model_fixture,
+    download_gemma_2b_model_fixture,
+    download_gemma2_9b_model_fixture,
+    download_mlx_mistral_7b_model_fixture,
+    download_llama2_model_fixture,
+):
+    pass