Various test fixes for flaky tests (#2110)

* add mhenrichsen/alpaca_2k_test with revision dataset download fixture for flaky tests
* log slowest tests
* pin pynvml==11.5.3
* fix load local hub path
* optimize for speed with smaller models and val_set_size
* replace pynvml
* make the resume from checkpoint e2e faster
* make tests smaller
2024-12-02 17:28:58 -05:00
parent b0fbd4d11d
commit c0c53eb62f
13 changed files with 78 additions and 44 deletions
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -7,7 +7,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer

 from axolotl.utils.callbacks.perplexity import Perplexity

-MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"


@fixture()
@@ -22,7 +22,9 @@ def model():

@fixture()
 def tokenizer():
-    return AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    tokenizer_ = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    tokenizer_.add_special_tokens({"pad_token": "<|endoftext|>"})
+    return tokenizer_


 def test_perplexity_longer_than_stride(model, metric):
@@ -33,7 +35,7 @@ One day, a little fish named Fin was swimming near the shore. He saw a big crab
 """
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
-    assert round(ppl, 2) == 5.37
+    assert round(ppl, 2) == 7.41


 def test_perplexity_short(model, metric):
@@ -41,4 +43,4 @@ def test_perplexity_short(model, metric):
    sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
-    assert round(ppl, 2) == 10.02
+    assert round(ppl, 2) == 10.33