feat: upgrade transformers to v4.56.1 (#3127)

* feat: upgrade transformers to v4.56 * fix handling of CP/SP now that position_ids are default even for unpacked sequences * feat: monkeypatch list_repo_templates * fix: apply patch for tests only * see if updated main works at least * fix: update to patch release and remove monkeypatch * remove fsdp2 eval patch --------- Co-authored-by: Wing Lian <wing@axolotl.ai>
2025-09-05 22:00:54 +07:00
parent c6ae5c43cb
commit 1d32278755
5 changed files with 5 additions and 35 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ packaging==23.2
 huggingface_hub>=0.33.0
 peft>=0.17.0
-transformers==4.55.4
+transformers==4.56.1
 tokenizers>=0.21.1
 accelerate==1.10.0
 datasets==4.0.0
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -80,13 +80,7 @@ class PatchManager:
            patch_maybe_log_save_evaluate,
        )
-        patch_fsdp2 = (
+        patch_evaluation_loop()
            self.cfg.torch_compile
            and self.cfg.fsdp_config
            and self.cfg.fsdp_version == 2
        )
        patch_evaluation_loop(patch_fsdp2)
        patch_maybe_log_save_evaluate()
    def apply_post_model_load_patches(self, model: PreTrainedModel):
--- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
@@ -28,15 +28,6 @@ PATCHED_EVAL_CODE = {
    "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
 }
 ORIGINAL_FSDP2_CODE = """
    model.eval()
 """
 PATCHED_FSDP2_CODE = """
    if hasattr(model, "eval") and callable(model.eval):
        self.model.eval()
 """
 ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()"
 PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()"
@@ -46,13 +37,7 @@ def check_evaluation_loop_is_patchable() -> bool:
    return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values())
-def check_evaluation_loop_is_fsdp2_patchable() -> bool:
+def patch_evaluation_loop():
    evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
    evaluation_loop_source, _ = detab_code(evaluation_loop_source)
    return ORIGINAL_FSDP2_CODE in evaluation_loop_source
 def patch_evaluation_loop(patch_fsdp2: bool):
    """Patch the evaluation_loop method."""
    # Check if already patched
    if hasattr(Trainer, "_original_evaluation_loop"):
@@ -75,13 +60,6 @@ def patch_evaluation_loop(patch_fsdp2: bool):
        ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
    )
    # Apply FSDP2 eval guard patch if needed
    if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source:
        evaluation_loop_source = evaluation_loop_source.replace(
            ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE
        )
        LOG.info("Applied FSDP2 eval guard patch to evaluation_loop")
    # Rename the function to avoid conflicts
    evaluation_loop_source = evaluation_loop_source.replace(
        "def evaluation_loop(",
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -48,10 +48,10 @@ def apply_sequence_parallelism(
            - The original sequence length before padding.
            - The number of padding tokens added.
    """
-    original_seq_len = batch["input_ids"].size(1)
+    batch_size, original_seq_len = batch["input_ids"].shape
    # Update ring attention params if needed
-    if batch.get("position_ids") is not None:
+    if batch.get("position_ids") is not None and batch_size == 1:
        update_ring_attn_params(position_ids=batch["position_ids"])
    else:
        # If position_ids aren't already in the batch, create them
--- a/tests/monkeypatch/test_trainer_loss_calc.py
+++ b/tests/monkeypatch/test_trainer_loss_calc.py
@@ -3,7 +3,6 @@
 import unittest
 from axolotl.monkeypatch.transformers.trainer_loss_calc import (
    check_evaluation_loop_is_fsdp2_patchable,
    check_evaluation_loop_is_patchable,
    check_maybe_log_save_evaluate_is_patchable,
 )
@@ -20,7 +19,6 @@ class TestTrainerLossCalc(unittest.TestCase):
        the patched code changes upstream.
        """
        assert check_evaluation_loop_is_patchable()
        assert check_evaluation_loop_is_fsdp2_patchable()
        assert check_maybe_log_save_evaluate_is_patchable()