From a85efffbef716e20f7b51faac7a84d379d8f7474 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 18 Jun 2025 15:46:14 -0400 Subject: [PATCH] bump transformers==4.52.4 (#2800) [skip ci] * bump transformers==4.52.4 * don't use hf offline for qwen tokenizer * increase timeout * don't use methodtype * increase timeout * better assertion logging * upgrade deepspeed version too --- cicd/e2e_tests.py | 2 +- cicd/multigpu.py | 2 +- requirements.txt | 2 +- setup.py | 2 +- src/axolotl/integrations/kd/kernels/models.py | 3 +-- tests/e2e/multigpu/patched/test_sp.py | 5 +++- tests/e2e/multigpu/solo/test_flex.py | 2 +- tests/e2e/multigpu/test_gemma3.py | 2 +- tests/e2e/multigpu/test_llama.py | 24 +++++++++---------- tests/e2e/multigpu/test_ray.py | 4 ++-- tests/e2e/patched/test_fa_xentropy.py | 2 +- tests/e2e/patched/test_unsloth_qlora.py | 6 ++--- tests/e2e/solo/test_flex.py | 2 +- tests/e2e/test_llama_pretrain.py | 2 +- tests/e2e/test_packing_loss.py | 2 +- tests/e2e/test_qat.py | 2 +- tests/e2e/test_reward_model_smollm2.py | 2 +- .../test_chat_templates_thinking.py | 3 --- 18 files changed, 34 insertions(+), 35 deletions(-) diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py index ce9c605c7..5d2b6fed1 100644 --- a/cicd/e2e_tests.py +++ b/cicd/e2e_tests.py @@ -6,7 +6,7 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, # 90 min + timeout=120 * 60, # 120 min cpu=8.0, memory=131072, volumes=VOLUME_CONFIG, diff --git a/cicd/multigpu.py b/cicd/multigpu.py index a2dd8d0b3..848110a84 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -69,7 +69,7 @@ def run_cmd(cmd: str, run_folder: str): @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, + timeout=120 * 60, cpu=16.0, memory=131072 * N_GPUS, volumes=VOLUME_CONFIG, diff --git a/requirements.txt b/requirements.txt index cf8caba00..8bd77ab5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ 
packaging==23.2 huggingface_hub==0.32.2 peft==0.15.2 -transformers==4.52.3 +transformers==4.52.4 tokenizers>=0.21.1 accelerate==1.7.0 datasets==3.6.0 diff --git a/setup.py b/setup.py index 28f71f789..08c39c71c 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ extras_require = { "yunchang==0.6.0", ], "deepspeed": [ - "deepspeed==0.17.0", + "deepspeed==0.17.1", "deepspeed-kernels", ], "mamba-ssm": [ diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py index bfd752964..5a7c286bc 100644 --- a/src/axolotl/integrations/kd/kernels/models.py +++ b/src/axolotl/integrations/kd/kernels/models.py @@ -2,7 +2,6 @@ model patcher for chunked top-k kl-div """ -from types import MethodType from typing import Optional, Union, Unpack import torch @@ -95,4 +94,4 @@ def apply_kernel(model_type): model_cls_prefix = "".join([part.capitalize() for part in model_type.split("_")]) module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]) model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") - model_cls.forward = MethodType(kldiv_forward_llama_like, model_cls) + model_cls.forward = kldiv_forward_llama_like diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py index e90def2b7..8883e0135 100644 --- a/tests/e2e/multigpu/patched/test_sp.py +++ b/tests/e2e/multigpu/patched/test_sp.py @@ -91,7 +91,10 @@ class TestSequenceParallelism: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", threshold, "Train Loss is too high" + temp_dir + "/runs", + "train/train_loss", + threshold, + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index 42c3c00c8..c8f14330d 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -85,5 +85,5 @@ class TestPackedFlex: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is 
too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index 9bff25f40..b4cb6e59d 100644 --- a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -91,5 +91,5 @@ class TestMultiGPUGemma3: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index 9c4bf5054..a8ed6bda0 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -89,7 +89,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -154,7 +154,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) def test_dpo_lora_ddp(self, temp_dir): @@ -232,7 +232,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) def test_dpo_qlora_ddp(self, temp_dir): @@ -310,7 +310,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( @@ -380,7 +380,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -452,7 +452,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", 
"train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_2_6_0 @@ -533,7 +533,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) def test_fsdp_qlora_prequant_packed(self, temp_dir): @@ -613,7 +613,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -697,7 +697,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -771,7 +771,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -845,7 +845,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.skip( @@ -912,5 +912,5 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index f2c812eb5..22023507a 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -75,7 +75,7 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_lt_2_6_0 @@ -133,5 +133,5 @@ class TestMultiGPURay: ) 
check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index 4e3cbc50d..ca8b21178 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -78,5 +78,5 @@ class TestFAXentropyLlama: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 9567c0b18..69171481c 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -73,7 +73,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) def test_unsloth_llama_qlora_unpacked(self, temp_dir): @@ -123,7 +123,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -178,5 +178,5 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 8d1a0c7d1..f6b8c6283 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - 
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index 6944c6f5e..fdebf2173 100644 --- a/tests/e2e/test_llama_pretrain.py +++ b/tests/e2e/test_llama_pretrain.py @@ -69,5 +69,5 @@ class TestPretrainLlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 463f7c838..cc2db72e0 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py index 964bf3c1c..ef726079d 100644 --- a/tests/e2e/test_qat.py +++ b/tests/e2e/test_qat.py @@ -129,5 +129,5 @@ class TestQATLlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py index 304fda1cc..5d52bcc86 100644 --- a/tests/e2e/test_reward_model_smollm2.py +++ b/tests/e2e/test_reward_model_smollm2.py @@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg) diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py index 79429b731..e807111aa 100644 --- 
a/tests/prompt_strategies/test_chat_templates_thinking.py +++ b/tests/prompt_strategies/test_chat_templates_thinking.py @@ -11,8 +11,6 @@ from axolotl.prompt_strategies.chat_template import ( ) from axolotl.utils.dict import DictDefault -from tests.hf_offline_utils import enable_hf_offline - @pytest.fixture(name="messages_w_reasoning") def messages_w_reasoning_fixture(): @@ -59,7 +57,6 @@ def messages_w_reasoning_fixture(): @pytest.fixture(name="qwen3_tokenizer") -@enable_hf_offline def qwen3_tokenizer_fixture( download_qwen3_half_billion_model, ): # pylint: disable=unused-argument