From 1d32278755108c47eafbb6dca95c62e807771351 Mon Sep 17 00:00:00 2001
From: NanoCode012 <nano@axolotl.ai>
Date: Fri, 5 Sep 2025 22:00:54 +0700
Subject: [PATCH] feat: upgrade transformers to v4.56.1 (#3127)

* feat: upgrade transformers to v4.56

* fix handling of CP/SP now that position_ids are default even for unpacked sequences

* feat: monkeypatch list_repo_templates

* fix: apply patch for tests only

* see if updated main works at least

* fix: update to patch release and remove monkeypatch

* remove fsdp2 eval patch

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
---
 requirements.txt                              |  2 +-
 src/axolotl/loaders/patch_manager.py          |  8 +------
 .../transformers/trainer_loss_calc.py         | 24 +------------------
 .../utils/ctx_managers/sequence_parallel.py   |  4 ++--
 tests/monkeypatch/test_trainer_loss_calc.py   |  2 --
 5 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9e3dbbca4..1292a179a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ packaging==23.2
 
 huggingface_hub>=0.33.0
 peft>=0.17.0
-transformers==4.55.4
+transformers==4.56.1
 tokenizers>=0.21.1
 accelerate==1.10.0
 datasets==4.0.0
diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
index 94b307a62..044c278a3 100644
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -80,13 +80,7 @@ class PatchManager:
             patch_maybe_log_save_evaluate,
         )
 
-        patch_fsdp2 = (
-            self.cfg.torch_compile
-            and self.cfg.fsdp_config
-            and self.cfg.fsdp_version == 2
-        )
-
-        patch_evaluation_loop(patch_fsdp2)
+        patch_evaluation_loop()
         patch_maybe_log_save_evaluate()
 
     def apply_post_model_load_patches(self, model: PreTrainedModel):
diff --git a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
index 012c699fa..c9b968d71 100644
--- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
@@ -28,15 +28,6 @@ PATCHED_EVAL_CODE = {
     "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
 }
 
-ORIGINAL_FSDP2_CODE = """
-    model.eval()
-"""
-
-PATCHED_FSDP2_CODE = """
-    if hasattr(model, "eval") and callable(model.eval):
-        self.model.eval()
-"""
-
 ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()"
 PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()"
 
@@ -46,13 +37,7 @@ def check_evaluation_loop_is_patchable() -> bool:
     return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values())
 
 
-def check_evaluation_loop_is_fsdp2_patchable() -> bool:
-    evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
-    evaluation_loop_source, _ = detab_code(evaluation_loop_source)
-    return ORIGINAL_FSDP2_CODE in evaluation_loop_source
-
-
-def patch_evaluation_loop(patch_fsdp2: bool):
+def patch_evaluation_loop():
     """Patch the evaluation_loop method."""
     # Check if already patched
     if hasattr(Trainer, "_original_evaluation_loop"):
@@ -75,13 +60,6 @@ def patch_evaluation_loop(patch_fsdp2: bool):
         ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
     )
 
-    # Apply FSDP2 eval guard patch if needed
-    if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source:
-        evaluation_loop_source = evaluation_loop_source.replace(
-            ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE
-        )
-        LOG.info("Applied FSDP2 eval guard patch to evaluation_loop")
-
     # Rename the function to avoid conflicts
     evaluation_loop_source = evaluation_loop_source.replace(
         "def evaluation_loop(",
diff --git a/src/axolotl/utils/ctx_managers/sequence_parallel.py b/src/axolotl/utils/ctx_managers/sequence_parallel.py
index 1ec91ae2a..78b3d1cae 100644
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -48,10 +48,10 @@ def apply_sequence_parallelism(
             - The original sequence length before padding.
             - The number of padding tokens added.
     """
-    original_seq_len = batch["input_ids"].size(1)
+    batch_size, original_seq_len = batch["input_ids"].shape
 
     # Update ring attention params if needed
-    if batch.get("position_ids") is not None:
+    if batch.get("position_ids") is not None and batch_size == 1:
         update_ring_attn_params(position_ids=batch["position_ids"])
     else:
         # If position_ids aren't already in the batch, create them
diff --git a/tests/monkeypatch/test_trainer_loss_calc.py b/tests/monkeypatch/test_trainer_loss_calc.py
index de3e92621..c72cb621b 100644
--- a/tests/monkeypatch/test_trainer_loss_calc.py
+++ b/tests/monkeypatch/test_trainer_loss_calc.py
@@ -3,7 +3,6 @@
 import unittest
 
 from axolotl.monkeypatch.transformers.trainer_loss_calc import (
-    check_evaluation_loop_is_fsdp2_patchable,
     check_evaluation_loop_is_patchable,
     check_maybe_log_save_evaluate_is_patchable,
 )
@@ -20,7 +19,6 @@ class TestTrainerLossCalc(unittest.TestCase):
         the patched code changes upstream.
         """
         assert check_evaluation_loop_is_patchable()
-        assert check_evaluation_loop_is_fsdp2_patchable()
         assert check_maybe_log_save_evaluate_is_patchable()