From 418ad2b586fa4e67a4b81950f789106b939f7474 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 3 Dec 2024 18:08:46 -0500
Subject: [PATCH] add missing fixture decorator for predownload dataset (#2117)
 [skip ci]

* add missing fixture decorator for predownload dataset

* also pre-download the tokenizer files
---
 tests/conftest.py                   |  1 +
 tests/prompt_strategies/conftest.py | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/tests/conftest.py b/tests/conftest.py
index 2fc985d3a..3a20bbfdd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,6 +46,7 @@ def download_mhenrichsen_alpaca_2k_w_revision_dataset():
     )
 
 
+@pytest.fixture(scope="session", autouse=True)
 def download_mlabonne_finetome_100k_dataset():
     # download the dataset
     snapshot_download("mlabonne/FineTome-100k", repo_type="dataset")
diff --git a/tests/prompt_strategies/conftest.py b/tests/prompt_strategies/conftest.py
index 43423f725..00a2bf300 100644
--- a/tests/prompt_strategies/conftest.py
+++ b/tests/prompt_strategies/conftest.py
@@ -4,6 +4,7 @@ shared fixtures for prompt strategies tests
 
 import pytest
 from datasets import Dataset
+from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer
 
 
@@ -60,6 +61,17 @@ def fixture_basic_dataset():
 
 
 @pytest.fixture(name="llama3_tokenizer")
 def fixture_llama3_tokenizer():
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct",
+        filename="special_tokens_map.json",
+    )
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct",
+        filename="tokenizer_config.json",
+    )
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct", filename="tokenizer.json"
+    )
     tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
     return tokenizer