hf offline decorator for tests to work around rate limits (#2452) [skip ci]

* hf offline decorator for tests to work around rate limits

* fail quicker so we can see logs

* try new cache name

* limit files downloaded

* phi mini predownload

* offline decorator for phi tokenizer

* handle meta llama 8b offline too

* make sure to return fixtures if they are wrapped too

* more fixes

* more things offline

* more offline things

* fix the env var

* fix the model name

* handle gemma also

* force reload of modules to recheck offline status

* prefetch mistral too

* use reset_sessions so hub picks up offline mode

* more fixes

* rename so it doesn't seem like a context manager

* fix backoff

* switch out tinyshakespeare dataset since it runs a py script to fetch data and doesn't work offline

* include additional dataset

* more fixes

* more fixes

* replace tiny shakespeare dataset

* skip some tests for now

* use more robust check using snapshot download to determine if a dataset name is on the hub

* typo for skip reason

* use local_files_only

* more fixtures

* remove local only

* use tiny shakespeare as pretrain dataset and streaming can't be offline even if precached

* make sure fixtures aren't offline

improve the offline reset
try bumping version of datasets
reorder reloading and setting
prime a new cache
run the tests now with fresh cache
try with a static cache

* now run all the ci again with hopefully a correct cache

* skip wonky tests for now

* skip wonky tests for now

* handle offline mode for model card creation
This commit is contained in:
Wing Lian
2025-03-28 19:20:46 -04:00
committed by GitHub
parent a4e430e7c4
commit 05f03b541a
21 changed files with 381 additions and 50 deletions

View File

@@ -7,14 +7,16 @@ import tempfile
import unittest
from pathlib import Path
from conftest import snapshot_download_w_retry
import pytest
from constants import (
ALPACA_MESSAGES_CONFIG_OG,
ALPACA_MESSAGES_CONFIG_REVISION,
SPECIAL_TOKENS,
)
from datasets import Dataset
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from utils import enable_hf_offline
from axolotl.utils.data import load_tokenized_prepared_datasets
from axolotl.utils.data.rl import load_prepare_preference_datasets
@@ -24,6 +26,7 @@ from axolotl.utils.dict import DictDefault
class TestDatasetPreparation(unittest.TestCase):
"""Test a configured dataloader."""
@enable_hf_offline
def setUp(self) -> None:
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
self.tokenizer.add_special_tokens(SPECIAL_TOKENS)
@@ -38,6 +41,8 @@ class TestDatasetPreparation(unittest.TestCase):
]
)
@pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
@enable_hf_offline
def test_load_hub(self):
"""Core use case. Verify that processing data from the hub works"""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -64,16 +69,21 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
@pytest.mark.skip("datasets bug with local datasets when offline")
def test_load_local_hub(self):
"""Niche use case. Verify that a local copy of a hub dataset can be loaded"""
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download_w_retry(
snapshot_path = snapshot_download(
repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset",
local_dir=tmp_ds_path,
)
# offline mode doesn't actually copy it to local_dir, so we
# have to copy all the contents in the dir manually from the returned snapshot_path
shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)
prepared_path = Path(tmp_dir) / "prepared"
# Right now a local copy that doesn't fully conform to a dataset
@@ -106,6 +116,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "labels" in dataset.features
shutil.rmtree(tmp_ds_path)
@enable_hf_offline
def test_load_from_save_to_disk(self):
"""Usual use case. Verify datasets saved via `save_to_disk` can be loaded."""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -135,6 +146,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
def test_load_from_dir_of_parquet(self):
"""Usual use case. Verify a directory of parquet files can be loaded."""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -171,6 +183,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
def test_load_from_dir_of_json(self):
"""Standard use case. Verify a directory of json files can be loaded."""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -207,6 +220,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
def test_load_from_single_parquet(self):
"""Standard use case. Verify a single parquet file can be loaded."""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -237,6 +251,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
def test_load_from_single_json(self):
"""Standard use case. Verify a single json file can be loaded."""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -267,6 +282,8 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@pytest.mark.skip(reason="TODO: fix hf offline mode for CI rate limits")
@enable_hf_offline
def test_load_hub_with_dpo(self):
"""Verify that processing dpo data from the hub works"""
@@ -285,6 +302,8 @@ class TestDatasetPreparation(unittest.TestCase):
assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features
@pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
@enable_hf_offline
def test_load_hub_with_revision(self):
"""Verify that processing data from the hub works with a specific revision"""
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -316,6 +335,7 @@ class TestDatasetPreparation(unittest.TestCase):
assert "attention_mask" in dataset.features
assert "labels" in dataset.features
@enable_hf_offline
def test_load_hub_with_revision_with_dpo(self):
"""Verify that processing dpo data from the hub works with a specific revision"""
@@ -334,17 +354,20 @@ class TestDatasetPreparation(unittest.TestCase):
assert len(train_dataset) == 1800
assert "conversation" in train_dataset.features
@enable_hf_offline
@pytest.mark.skip("datasets bug with local datasets when offline")
def test_load_local_hub_with_revision(self):
"""Verify that a local copy of a hub dataset can be loaded with a specific revision"""
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download_w_retry(
snapshot_path = snapshot_download(
repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset",
local_dir=tmp_ds_path,
revision="d05c1cb",
)
shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)
prepared_path = Path(tmp_dir) / "prepared"
cfg = DictDefault(
@@ -375,17 +398,19 @@ class TestDatasetPreparation(unittest.TestCase):
assert "labels" in dataset.features
shutil.rmtree(tmp_ds_path)
@enable_hf_offline
def test_loading_local_dataset_folder(self):
"""Verify that a dataset downloaded to a local folder can be loaded"""
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
tmp_ds_path.mkdir(parents=True, exist_ok=True)
snapshot_download_w_retry(
snapshot_path = snapshot_download(
repo_id="mhenrichsen/alpaca_2k_test",
repo_type="dataset",
local_dir=tmp_ds_path,
)
shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)
prepared_path = Path(tmp_dir) / "prepared"
cfg = DictDefault(