From a85efffbef716e20f7b51faac7a84d379d8f7474 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 18 Jun 2025 15:46:14 -0400 Subject: [PATCH] bump transformers==4.52.4 (#2800) [skip ci] * bump transformers==4.52.4 * don't use hf offline for qwen tokenizer * increase timeout * don't use methodtype * increase timeout * better assertion logging * upgrade deepspeed version too --- cicd/e2e_tests.py | 2 +- cicd/multigpu.py | 2 +- requirements.txt | 2 +- setup.py | 2 +- src/axolotl/integrations/kd/kernels/models.py | 3 +-- tests/e2e/multigpu/patched/test_sp.py | 5 +++- tests/e2e/multigpu/solo/test_flex.py | 2 +- tests/e2e/multigpu/test_gemma3.py | 2 +- tests/e2e/multigpu/test_llama.py | 24 +++++++++---------- tests/e2e/multigpu/test_ray.py | 4 ++-- tests/e2e/patched/test_fa_xentropy.py | 2 +- tests/e2e/patched/test_unsloth_qlora.py | 6 ++--- tests/e2e/solo/test_flex.py | 2 +- tests/e2e/test_llama_pretrain.py | 2 +- tests/e2e/test_packing_loss.py | 2 +- tests/e2e/test_qat.py | 2 +- tests/e2e/test_reward_model_smollm2.py | 2 +- .../test_chat_templates_thinking.py | 3 --- 18 files changed, 34 insertions(+), 35 deletions(-) diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py index ce9c605c7..5d2b6fed1 100644 --- a/cicd/e2e_tests.py +++ b/cicd/e2e_tests.py @@ -6,7 +6,7 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, # 90 min + timeout=120 * 60, # 120 min cpu=8.0, memory=131072, volumes=VOLUME_CONFIG, diff --git a/cicd/multigpu.py b/cicd/multigpu.py index a2dd8d0b3..848110a84 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -69,7 +69,7 @@ def run_cmd(cmd: str, run_folder: str): @app.function( image=cicd_image, gpu=GPU_CONFIG, - timeout=90 * 60, + timeout=120 * 60, cpu=16.0, memory=131072 * N_GPUS, volumes=VOLUME_CONFIG, diff --git a/requirements.txt b/requirements.txt index cf8caba00..8bd77ab5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ 
packaging==23.2 huggingface_hub==0.32.2 peft==0.15.2 -transformers==4.52.3 +transformers==4.52.4 tokenizers>=0.21.1 accelerate==1.7.0 datasets==3.6.0 diff --git a/setup.py b/setup.py index 28f71f789..08c39c71c 100644 --- a/setup.py +++ b/setup.py @@ -118,7 +118,7 @@ extras_require = { "yunchang==0.6.0", ], "deepspeed": [ - "deepspeed==0.17.0", + "deepspeed==0.17.1", "deepspeed-kernels", ], "mamba-ssm": [ diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py index bfd752964..5a7c286bc 100644 --- a/src/axolotl/integrations/kd/kernels/models.py +++ b/src/axolotl/integrations/kd/kernels/models.py @@ -2,7 +2,6 @@ model patcher for chunked top-k kl-div """ -from types import MethodType from typing import Optional, Union, Unpack import torch @@ -95,4 +94,4 @@ def apply_kernel(model_type): model_cls_prefix = "".join([part.capitalize() for part in model_type.split("_")]) module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]) model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") - model_cls.forward = MethodType(kldiv_forward_llama_like, model_cls) + model_cls.forward = kldiv_forward_llama_like diff --git a/tests/e2e/multigpu/patched/test_sp.py b/tests/e2e/multigpu/patched/test_sp.py index e90def2b7..8883e0135 100644 --- a/tests/e2e/multigpu/patched/test_sp.py +++ b/tests/e2e/multigpu/patched/test_sp.py @@ -91,7 +91,10 @@ class TestSequenceParallelism: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", threshold, "Train Loss is too high" + temp_dir + "/runs", + "train/train_loss", + threshold, + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index 42c3c00c8..c8f14330d 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -85,5 +85,5 @@ class TestPackedFlex: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is 
too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index 9bff25f40..b4cb6e59d 100644 --- a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -91,5 +91,5 @@ class TestMultiGPUGemma3: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index 9c4bf5054..a8ed6bda0 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -89,7 +89,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -154,7 +154,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) def test_dpo_lora_ddp(self, temp_dir): @@ -232,7 +232,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) def test_dpo_qlora_ddp(self, temp_dir): @@ -310,7 +310,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) @pytest.mark.parametrize( @@ -380,7 +380,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -452,7 +452,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", 
"train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_2_6_0 @@ -533,7 +533,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) def test_fsdp_qlora_prequant_packed(self, temp_dir): @@ -613,7 +613,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -697,7 +697,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -771,7 +771,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -845,7 +845,7 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @pytest.mark.skip( @@ -912,5 +912,5 @@ class TestMultiGPULlama: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index f2c812eb5..22023507a 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -75,7 +75,7 @@ class TestMultiGPURay: ) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) @require_torch_lt_2_6_0 @@ -133,5 +133,5 @@ class TestMultiGPURay: ) 
check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index 4e3cbc50d..ca8b21178 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -78,5 +78,5 @@ class TestFAXentropyLlama: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 9567c0b18..69171481c 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -73,7 +73,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) def test_unsloth_llama_qlora_unpacked(self, temp_dir): @@ -123,7 +123,7 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) @pytest.mark.parametrize( @@ -178,5 +178,5 @@ class TestUnslothQLoRA: check_model_output_exists(temp_dir, cfg) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 8d1a0c7d1..f6b8c6283 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -63,5 +63,5 @@ class TestPackedFlex(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - 
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index 6944c6f5e..fdebf2173 100644 --- a/tests/e2e/test_llama_pretrain.py +++ b/tests/e2e/test_llama_pretrain.py @@ -69,5 +69,5 @@ class TestPretrainLlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py index 463f7c838..cc2db72e0 100644 --- a/tests/e2e/test_packing_loss.py +++ b/tests/e2e/test_packing_loss.py @@ -62,5 +62,5 @@ class TestPackedLlama(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high" ) diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py index 964bf3c1c..ef726079d 100644 --- a/tests/e2e/test_qat.py +++ b/tests/e2e/test_qat.py @@ -129,5 +129,5 @@ class TestQATLlama: temp_dir + "/runs", "train/train_loss", loss_threshold, - "Train Loss is too high", + "Train Loss (%s) is too high", ) diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py index 304fda1cc..5d52bcc86 100644 --- a/tests/e2e/test_reward_model_smollm2.py +++ b/tests/e2e/test_reward_model_smollm2.py @@ -66,6 +66,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg) diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py index 79429b731..e807111aa 100644 --- 
a/tests/prompt_strategies/test_chat_templates_thinking.py +++ b/tests/prompt_strategies/test_chat_templates_thinking.py @@ -11,8 +11,6 @@ from axolotl.prompt_strategies.chat_template import ( ) from axolotl.utils.dict import DictDefault -from tests.hf_offline_utils import enable_hf_offline - @pytest.fixture(name="messages_w_reasoning") def messages_w_reasoning_fixture(): @@ -59,7 +57,6 @@ def messages_w_reasoning_fixture(): @pytest.fixture(name="qwen3_tokenizer") -@enable_hf_offline def qwen3_tokenizer_fixture( download_qwen3_half_billion_model, ): # pylint: disable=unused-argument