Compare commits
1 Commits
fix/issue-
...
fix/cp-was
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
255c5b90ca |
@@ -133,13 +133,6 @@ class PatchManager:
|
|||||||
patch_evaluation_loop()
|
patch_evaluation_loop()
|
||||||
patch_maybe_log_save_evaluate()
|
patch_maybe_log_save_evaluate()
|
||||||
|
|
||||||
if self.cfg.context_parallel_size > 1:
|
|
||||||
from axolotl.monkeypatch.transformers.trainer_context_parallel import (
|
|
||||||
patch_prepare_context_parallel_inputs,
|
|
||||||
)
|
|
||||||
|
|
||||||
patch_prepare_context_parallel_inputs()
|
|
||||||
|
|
||||||
def apply_post_model_build_patches(self, model: PreTrainedModel):
|
def apply_post_model_build_patches(self, model: PreTrainedModel):
|
||||||
"""Apply patches right after model build, before post-load setup."""
|
"""Apply patches right after model build, before post-load setup."""
|
||||||
self._finalize_moe_expert_quantization(model)
|
self._finalize_moe_expert_quantization(model)
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ def patch_prepare_cp():
|
|||||||
import contextlib
|
import contextlib
|
||||||
|
|
||||||
from accelerate import Accelerator
|
from accelerate import Accelerator
|
||||||
|
from transformers import Trainer
|
||||||
|
|
||||||
def patched_prepare_cp(self, *args):
|
def patched_prepare_cp(self, *args):
|
||||||
if self.parallelism_config.cp_backend == "deepspeed":
|
if self.parallelism_config.cp_backend == "deepspeed":
|
||||||
@@ -95,4 +96,11 @@ def patch_prepare_cp():
|
|||||||
self._cp_context = _noop_cp_context
|
self._cp_context = _noop_cp_context
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
def _noop_prepare_context_parallel_inputs(self, model, inputs):
|
||||||
|
return contextlib.nullcontext, inputs
|
||||||
|
|
||||||
|
# prevent double CP partition
|
||||||
Accelerator._prepare_cp = patched_prepare_cp
|
Accelerator._prepare_cp = patched_prepare_cp
|
||||||
|
|
||||||
|
# remove unneeded calculation upstream
|
||||||
|
Trainer._prepare_context_parallel_inputs = _noop_prepare_context_parallel_inputs
|
||||||
|
|||||||
@@ -1,72 +0,0 @@
|
|||||||
"""Monkey patch to allow context parallelism with FlashAttention in HF Trainer."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib
|
|
||||||
import inspect
|
|
||||||
|
|
||||||
from transformers import Trainer
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.utils import detab_code
|
|
||||||
from axolotl.utils.logging import get_logger
|
|
||||||
|
|
||||||
# Module-level logger for patch diagnostics.
LOG = get_logger(__name__)

# Exact guard line inside HF's Trainer._prepare_context_parallel_inputs; the
# replacement below is a plain substring swap, so this must match upstream
# byte-for-byte or the patch is skipped.
GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":'
# Relaxed guard: accepts "flash_attention_2" in addition to "sdpa", and falls
# back to `model.model.config` when `model.config` has no _attn_implementation.
PATCHED_GUARD = 'if (attn_impl := (getattr(model.config, "_attn_implementation", None) or getattr(model.model.config, "_attn_implementation", None))) and attn_impl not in ("sdpa", "flash_attention_2"):'
def patch_prepare_context_parallel_inputs() -> None:
    """Relax the SDPA-only guard when running context parallelism with FlashAttention.

    Rewrites the source of ``Trainer._prepare_context_parallel_inputs`` so its
    attention-implementation guard also accepts ``flash_attention_2``, then
    ``exec``s the rewritten function and installs it on ``Trainer``.

    Idempotent: a repeat call returns immediately. If the method's source is
    unavailable or the expected guard line is not found (e.g. upstream
    changed), a warning is logged and ``Trainer`` is left untouched.
    """
    if getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False):
        LOG.debug("Trainer._prepare_context_parallel_inputs already patched")
        return

    try:
        original_source = inspect.getsource(Trainer._prepare_context_parallel_inputs)
    except OSError as exc:  # pragma: no cover - occurs when source is unavailable
        LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc)
        return

    if GUARD_PATTERN not in original_source:
        LOG.warning(
            "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n"
            "skipping FlashAttention context parallelism patch"
        )
        return

    patched_source = original_source.replace(GUARD_PATTERN, PATCHED_GUARD)
    patched_source, _ = detab_code(patched_source)
    # Rename the exec'd definition so it cannot shadow the real method name.
    patched_source = patched_source.replace(
        "def _prepare_context_parallel_inputs(",
        "def axolotl_prepare_context_parallel_inputs(",
        1,
    )

    module_name = Trainer.__module__
    module = importlib.import_module(module_name)

    # Import symbols referenced in the method so exec can succeed. Substring
    # matching is deliberately loose: importing extra names is harmless.
    items_to_import = [item for item in dir(module) if item in patched_source]

    # Use a separate namespace to capture the exec'd function.
    namespace = {}
    if items_to_import:
        # Guard the import: "from m import ()" with an empty list would be a
        # SyntaxError inside exec.
        exec(f"from {module_name} import ({', '.join(items_to_import)})", namespace)
    exec(patched_source, namespace)

    # Explicitly get the function from the namespace.
    axolotl_prepare_context_parallel_inputs = namespace[
        "axolotl_prepare_context_parallel_inputs"
    ]
    # Stash the original so tests/callers can restore the unpatched method.
    Trainer._original_prepare_context_parallel_inputs = (
        Trainer._prepare_context_parallel_inputs
    )
    Trainer._prepare_context_parallel_inputs = axolotl_prepare_context_parallel_inputs
    Trainer._axolotl_prepare_context_parallel_inputs_source = patched_source
    Trainer._axolotl_prepare_context_parallel_inputs_patched = True
    LOG.debug(
        "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP"
    )
@@ -1,66 +0,0 @@
|
|||||||
"""Tests for the HF Trainer context parallel patch."""
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from transformers import Trainer
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.transformers.trainer_context_parallel import (
|
|
||||||
GUARD_PATTERN,
|
|
||||||
PATCHED_GUARD,
|
|
||||||
patch_prepare_context_parallel_inputs,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def restore_trainer_prepare_method():
    """Ensure Trainer._prepare_context_parallel_inputs is restored after a test.

    Snapshots the pre-test state (preferring the stashed original method when a
    patch is already installed), yields to the test, then reinstalls the method
    and removes the bookkeeping attributes the patch sets on Trainer.
    """
    # Prefer the saved original if the class entered the test already patched.
    original_method = getattr(
        Trainer,
        "_original_prepare_context_parallel_inputs",
        Trainer._prepare_context_parallel_inputs,
    )
    # Record whether the "patched" flag existed BEFORE the test ran.
    patched_attr_present = hasattr(
        Trainer, "_axolotl_prepare_context_parallel_inputs_patched"
    )

    yield

    Trainer._prepare_context_parallel_inputs = original_method
    # NOTE(review): the flag is deleted only when it was present before the
    # test; a flag newly set BY the test is left in place, and a pre-existing
    # flag removed by the test would make this delattr raise — confirm intent.
    if patched_attr_present:
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
    if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
        delattr(Trainer, "_original_prepare_context_parallel_inputs")
    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"):
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source")
def test_patch_attention_guard(restore_trainer_prepare_method):
    """Patch should swap the guard to allow sdpa or flash attention."""
    # Reset Trainer to its pristine, unpatched method before exercising the patch.
    stashed = getattr(Trainer, "_original_prepare_context_parallel_inputs", None)
    if stashed is not None:
        Trainer._prepare_context_parallel_inputs = stashed
        delattr(Trainer, "_original_prepare_context_parallel_inputs")
    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"):
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")

    patch_prepare_context_parallel_inputs()

    # The patched method must be installed and the idempotency flag set.
    assert Trainer._prepare_context_parallel_inputs is not None
    assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False)

    # The rewritten source keeps only the relaxed guard.
    rewritten_source = Trainer._axolotl_prepare_context_parallel_inputs_source
    assert GUARD_PATTERN not in rewritten_source
    assert PATCHED_GUARD in rewritten_source
def test_patch_is_idempotent(restore_trainer_prepare_method):
    """Calling the patch twice should leave the same patched function in place."""
    patch_prepare_context_parallel_inputs()
    method_after_first_call = Trainer._prepare_context_parallel_inputs

    patch_prepare_context_parallel_inputs()
    method_after_second_call = Trainer._prepare_context_parallel_inputs

    # The second invocation must be a no-op: identical function object.
    assert method_after_first_call is method_after_second_call
Reference in New Issue
Block a user