From 7651550850b7c8d0420bbf8354b6e3dd2728493b Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 21 Apr 2025 10:31:50 -0400 Subject: [PATCH] make sure to download fixtures for kd test (#2541) * make sure to download fixtures for kd test * use same alpaca dataset --- tests/conftest.py | 26 ++++++++++++++++++++++++++ tests/e2e/multigpu/solo/__init__.py | 2 ++ tests/e2e/multigpu/solo/test_flex.py | 3 ++- tests/e2e/patched/test_resume.py | 3 ++- tests/e2e/solo/test_flex.py | 3 ++- tests/e2e/test_packing_loss.py | 3 ++- 6 files changed, 36 insertions(+), 4 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 97c48db41..3f3cc2732 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -193,6 +193,14 @@ def download_tiny_shakespeare_dataset(): snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset") +@pytest.fixture(scope="session", autouse=True) +def download_evolkit_kd_sample_dataset(): + # download the dataset + snapshot_download_w_retry( + "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", repo_type="dataset" + ) + + @pytest.fixture(scope="session", autouse=True) def download_deepseek_model_fixture(): snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model") @@ -208,6 +216,16 @@ def download_huggyllama_model_fixture(): ) +@pytest.fixture(scope="session", autouse=True) +def download_llama33_70b_model_fixture(): + # download the tokenizer only + snapshot_download_w_retry( + "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer", + repo_type="model", + allow_patterns=["*token*", "config.json"], + ) + + @pytest.fixture(scope="session", autouse=True) def download_llama_1b_model_fixture(): # download the tokenizer only @@ -315,6 +333,14 @@ def download_llama2_model_fixture(): ) +@pytest.fixture(scope="session", autouse=True) +def download_llama32_1b_model_fixture(): + snapshot_download_w_retry( + "osllmai-community/Llama-3.2-1B", + repo_type="model", + ) + + @pytest.fixture @enable_hf_offline def tokenizer_huggyllama( diff --git a/tests/e2e/multigpu/solo/__init__.py b/tests/e2e/multigpu/solo/__init__.py index e69de29bb..ed1ba7dc6 100644 --- a/tests/e2e/multigpu/solo/__init__.py +++ b/tests/e2e/multigpu/solo/__init__.py @@ -0,0 +1,2 @@ +# Tests under this directory should get run "solo" on their own as they +# seem to cause issues when run in the same batch as other tests. diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index cbe3794b3..471b112c1 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -49,8 +49,9 @@ class TestPackedFlex: }, "datasets": [ { - "path": "vicgalle/alpaca-gpt4", + "path": "tatsu-lab/alpaca", "type": "alpaca", + "split": "train[:10%]", }, ], "num_epochs": 1, diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py index f6a3e0109..68489ed03 100644 --- a/tests/e2e/patched/test_resume.py +++ b/tests/e2e/patched/test_resume.py @@ -46,8 +46,9 @@ class TestResumeLlama: }, "datasets": [ { - "path": "vicgalle/alpaca-gpt4", + "path": "tatsu-lab/alpaca", "type": "alpaca", + "split": "train[:10%]", }, ], "num_epochs": 2, diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 6de813e37..71da795f8 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -41,8 +41,9 @@ class TestPackedFlex(unittest.TestCase): }, "datasets": [ { - "path": "vicgalle/alpaca-gpt4", + "path": "tatsu-lab/alpaca", "type": "alpaca", + "split": "train[:10%]", }, ], "num_epochs": 1, diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 4e8e70419..73716f44b 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -40,8 +40,9 @@ class TestPackedLlama(unittest.TestCase): }, "datasets": [ { - "path": "vicgalle/alpaca-gpt4", + "path": "tatsu-lab/alpaca", "type": "alpaca", + "split": "train[:10%]", }, ], "num_epochs": 1,