bump transformers==4.52.4 (#2800) [skip ci]

* bump transformers==4.52.4
* don't use hf offline for qwen tokenizer
* increase timeout
* don't use methodtype
* increase timeout
* better assertion logging
* upgrade deepspeed version too
2025-06-18 15:46:14 -04:00
parent 06a648263b
commit a85efffbef
18 changed files with 34 additions and 35 deletions
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -91,7 +91,10 @@ class TestSequenceParallelism:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", threshold, "Train Loss is too high"
+            temp_dir + "/runs",
+            "train/train_loss",
+            threshold,
+            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -85,5 +85,5 @@ class TestPackedFlex:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -91,5 +91,5 @@ class TestMultiGPUGemma3:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -89,7 +89,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -154,7 +154,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    def test_dpo_lora_ddp(self, temp_dir):
@@ -232,7 +232,7 @@ class TestMultiGPULlama:
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
-            "Train Loss is too high",
+            "Train Loss (%s) is too high",
        )

    def test_dpo_qlora_ddp(self, temp_dir):
@@ -310,7 +310,7 @@ class TestMultiGPULlama:
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
-            "Train Loss is too high",
+            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
@@ -380,7 +380,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -452,7 +452,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_2_6_0
@@ -533,7 +533,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )

    def test_fsdp_qlora_prequant_packed(self, temp_dir):
@@ -613,7 +613,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -697,7 +697,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -771,7 +771,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -845,7 +845,7 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.skip(
@@ -912,5 +912,5 @@ class TestMultiGPULlama:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -75,7 +75,7 @@ class TestMultiGPURay:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_lt_2_6_0
@@ -133,5 +133,5 @@ class TestMultiGPURay:
        )

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -78,5 +78,5 @@ class TestFAXentropyLlama:
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -73,7 +73,7 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )

    def test_unsloth_llama_qlora_unpacked(self, temp_dir):
@@ -123,7 +123,7 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
@@ -178,5 +178,5 @@ class TestUnslothQLoRA:
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase):
        train(cfg=cfg, dataset_meta=dataset_meta)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -69,5 +69,5 @@ class TestPretrainLlama:
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
-            "Train Loss is too high",
+            "Train Loss (%s) is too high",
        )
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase):
        train(cfg=cfg, dataset_meta=dataset_meta)

        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -129,5 +129,5 @@ class TestQATLlama:
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
-            "Train Loss is too high",
+            "Train Loss (%s) is too high",
        )
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )
        check_model_output_exists(temp_dir, cfg)
--- a/tests/prompt_strategies/test_chat_templates_thinking.py
+++ b/tests/prompt_strategies/test_chat_templates_thinking.py
@@ -11,8 +11,6 @@ from axolotl.prompt_strategies.chat_template import (
 )
 from axolotl.utils.dict import DictDefault

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="messages_w_reasoning")
 def messages_w_reasoning_fixture():
@@ -59,7 +57,6 @@ def messages_w_reasoning_fixture():


@pytest.fixture(name="qwen3_tokenizer")
-@enable_hf_offline
 def qwen3_tokenizer_fixture(
    download_qwen3_half_billion_model,
 ):  # pylint: disable=unused-argument