add support for CP + torch SDPA

Merge branch 'main' into cp-fix
only patch in CP > 1 case
2025-09-25 12:03:43 -04:00 · 2025-09-24 14:01:23 -04:00 · 2025-09-24 13:36:14 -04:00 · 2025-09-24 13:25:46 -04:00 · 2025-09-24 13:08:38 -04:00
10 changed files with 414 additions and 66 deletions
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -84,6 +84,15 @@ class PatchManager:
        patch_evaluation_loop()
        patch_maybe_log_save_evaluate()

+        if self.cfg.context_parallel_size > 1 and getattr(
+            self.cfg, "flash_attention", False
+        ):
+            from axolotl.monkeypatch.transformers.trainer_context_parallel import (
+                patch_prepare_context_parallel_inputs,
+            )
+
+            patch_prepare_context_parallel_inputs()
+
    def apply_post_model_load_patches(self, model: PreTrainedModel):
        """Apply patches that require the model instance."""
        self._apply_llama_flash_attn_patches(model)
--- a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py
+++ b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py
@@ -13,21 +13,10 @@ from typing import Callable
 import torch
 import torch.distributed as dist
 import transformers
-import transformers.modeling_flash_attention_utils
+import transformers.modeling_flash_attention_utils as flash_utils
 from ring_flash_attn import ring_flash_attn_func
 from ring_flash_attn.adapters.hf_adapter import check_params
 from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal
-
-try:
-    from transformers.modeling_flash_attention_utils import _flash_supports_window
-except ImportError:
-    try:
-        from transformers.modeling_flash_attention_utils import (
-            _flash_supports_window_size as _flash_supports_window,
-        )
-    except ImportError:
-        _flash_supports_window = True
-
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

 from axolotl.utils.schemas.enums import RingAttnFunc
@@ -118,7 +107,7 @@ def create_flash_attn_forward_varlen_llama3(

        # Handle sliding window
        use_sliding_windows = (
-            _flash_supports_window
+            _flash_windows_supported()
            and sliding_window is not None
            and key_states.shape[1] > sliding_window
        )
@@ -194,3 +183,18 @@ def substitute_hf_flash_attn(
        from ring_flash_attn.adapters.hf_adapter import flash_attention_forward

        ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
+
+
+def _flash_windows_supported() -> bool:
+    """Return whether current transformers build advertises sliding-window support."""
+    support = getattr(flash_utils, "_flash_supports_window", None)
+    if support is None:
+        support = getattr(flash_utils, "_flash_supports_window_size", None)
+
+    if support is None:
+        return True
+
+    if callable(support):
+        return True
+
+    return bool(support)
--- a/src/axolotl/monkeypatch/ring_attn/patch.py
+++ b/src/axolotl/monkeypatch/ring_attn/patch.py
@@ -13,18 +13,9 @@ from typing import Optional

 import torch
 import torch.distributed as dist
+import transformers.modeling_flash_attention_utils as flash_utils
 from torch.distributed import DeviceMesh

-try:
-    from transformers.modeling_flash_attention_utils import _flash_supports_window
-except ImportError:
-    try:
-        from transformers.modeling_flash_attention_utils import (
-            _flash_supports_window_size as _flash_supports_window,
-        )
-    except ImportError:
-        _flash_supports_window = True
-
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import RingAttnFunc
@@ -83,7 +74,7 @@ def create_ring_flash_attention_forward(

        # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
        use_sliding_windows = (
-            _flash_supports_window
+            _flash_windows_supported()
            and sliding_window is not None
            and key_states.shape[1] > sliding_window
        )
@@ -225,3 +216,19 @@ def update_ring_attn_params(position_ids: torch.Tensor | None):
    cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
    cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
    update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
+
+
+def _flash_windows_supported() -> bool:
+    """Best-effort check for FlashAttention sliding-window support."""
+    support = getattr(flash_utils, "_flash_supports_window", None)
+    if support is None:
+        support = getattr(flash_utils, "_flash_supports_window_size", None)
+
+    if support is None:
+        return True
+
+    if callable(support):
+        # Signature differs across versions; assume support when callable.
+        return True
+
+    return bool(support)
--- a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
@@ -0,0 +1,68 @@
+"""Monkey patch to allow context parallelism with FlashAttention in HF Trainer."""
+
+from __future__ import annotations
+
+import importlib
+import inspect
+
+from transformers import Trainer
+
+from axolotl.monkeypatch.utils import detab_code
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":'
+PATCHED_GUARD = (
+    'if model.config._attn_implementation not in ("sdpa", "flash_attention_2"):'
+)
+
+
+def patch_prepare_context_parallel_inputs() -> None:
+    """Relax the SDPA-only guard when running context parallelism with FlashAttention."""
+    if getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False):
+        LOG.debug("Trainer._prepare_context_parallel_inputs already patched")
+        return
+
+    try:
+        original_source = inspect.getsource(Trainer._prepare_context_parallel_inputs)
+    except OSError as exc:  # pragma: no cover - occurs when source is unavailable
+        LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc)
+        return
+
+    if GUARD_PATTERN not in original_source:
+        LOG.warning(
+            "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n"
+            "skipping FlashAttention context parallelism patch"
+        )
+        return
+
+    patched_source = original_source.replace(GUARD_PATTERN, PATCHED_GUARD)
+    patched_source, _ = detab_code(patched_source)
+    patched_source = patched_source.replace(
+        "def _prepare_context_parallel_inputs(",
+        "def axolotl_prepare_context_parallel_inputs(",
+        1,
+    )
+
+    module_name = Trainer.__module__
+    module = importlib.import_module(module_name)
+
+    # import symbols referenced in the method so exec can succeed
+    items_to_import = []
+    for item in dir(module):
+        if item in patched_source:
+            items_to_import.append(item)
+
+    exec(f"from {module_name} import ({', '.join(items_to_import)})", globals())
+    exec(patched_source, globals())
+
+    Trainer._original_prepare_context_parallel_inputs = (
+        Trainer._prepare_context_parallel_inputs
+    )
+    Trainer._prepare_context_parallel_inputs = axolotl_prepare_context_parallel_inputs
+    Trainer._axolotl_prepare_context_parallel_inputs_source = patched_source
+    Trainer._axolotl_prepare_context_parallel_inputs_patched = True
+    LOG.debug(
+        "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP"
+    )
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -179,7 +179,11 @@ def execute_training(
                )
            )

-        if cfg.context_parallel_size > 1:
+        use_flash_cp = cfg.context_parallel_size > 1 and bool(
+            getattr(cfg, "flash_attention", False)
+        )
+
+        if use_flash_cp:
            models = [trainer.model]
            if hasattr(trainer, "ref_model") and trainer.ref_model:
                models.append(trainer.ref_model)
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -1,7 +1,6 @@
 """Module with validation methods for config pydantic model."""

 import json
-import sys
 import tempfile
 from pathlib import Path

@@ -1314,50 +1313,40 @@ class ComplexValidationMixin:
        if not self.context_parallel_size:
            self.context_parallel_size = 1
        elif self.context_parallel_size > 1:
-            if not self.flash_attention:
+            use_flash_attention = getattr(self, "flash_attention", False)
+            use_sdp_attention = getattr(self, "sdp_attention", False)
+
+            if not (use_flash_attention or use_sdp_attention):
                raise ValueError(
-                    "flash_attention: true must be set with context_parallel_size > 1"
+                    "context_parallel_size > 1 requires either flash_attention: true "
+                    "or sdp_attention: true"
                )

-            if self.sample_packing and self.micro_batch_size > 1:
-                raise ValueError(
-                    "micro_batch_size must be set to 1 when sample_packing is enabled "
-                    "due to a `ring-flash-attn` requirement"
+            if use_flash_attention:
+                if self.sample_packing and self.micro_batch_size > 1:
+                    raise ValueError(
+                        "micro_batch_size must be set to 1 when sample_packing is enabled "
+                        "due to a `ring-flash-attn` requirement"
+                    )
+
+                try:
+                    import ring_flash_attn  # noqa: F401  # Required after monkey-patching
+                except ImportError as exception:
+                    raise ImportError(
+                        "context_parallel_size > 1 but ring_flash_attn is not installed. "
+                        "Please install it with `pip install axolotl[ring-flash-attn] "
+                        "or `pip install ring-flash-attn>=0.1.4`."
+                    ) from exception
+
+                LOG.warning(
+                    "Sequence parallelism (SP) is enabled with "
+                    f"context_parallel_size={self.context_parallel_size}. "
+                    "Please note that logged losses may differ slightly to the non-SP "
+                    "losses due to transformers Trainer implementation details. "
+                    "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
+                    "for more details."
                )

-            try:
-                import transformers.modeling_flash_attention_utils
-                from transformers.utils import is_flash_attn_greater_or_equal
-
-                transformers.modeling_flash_attention_utils._flash_supports_window = (
-                    True
-                )
-                sys.modules[
-                    "transformers.modeling_flash_attention_utils"
-                ]._flash_supports_window = True
-                sys.modules[
-                    "transformers.modeling_flash_attention_utils"
-                ]._flash_supports_window_size = True
-                sys.modules[
-                    "transformers.modeling_flash_attention_utils"
-                ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
-                import ring_flash_attn  # noqa: F401  # Required after monkey-patching
-            except ImportError as exception:
-                raise ImportError(
-                    "context_parallel_size > 1 but ring_flash_attn is not installed. "
-                    "Please install it with `pip install axolotl[ring-flash-attn] "
-                    "or `pip install ring-flash-attn>=0.1.4`."
-                ) from exception
-
-            LOG.warning(
-                "Sequence parallelism (SP) is enabled with "
-                f"context_parallel_size={self.context_parallel_size}. "
-                "Please note that logged losses may differ slightly to the non-SP "
-                "losses due to transformers Trainer implementation details. "
-                "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
-                "for more details."
-            )
-
        return self

    @model_validator(mode="after")
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -23,6 +23,8 @@ class TestSequenceParallelism:
        pad_to_sequence_len=True,
        ring_attn_func=None,
        threshold=2.0,
+        flash_attention=True,
+        sdp_attention=False,
    ):
        """Helper method to run sequence parallel tests with different configurations"""
        cfg = DictDefault(
@@ -58,7 +60,8 @@ class TestSequenceParallelism:
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
-                "flash_attention": True,
+                "flash_attention": flash_attention,
+                "sdp_attention": sdp_attention,
                "loss_watchdog_threshold": 5.0,
                "loss_watchdog_patience": 3,
                "bf16": "auto",
@@ -132,3 +135,16 @@ class TestSequenceParallelism:
            ring_attn_func=ring_attn_func,
            threshold=threshold,
        )
+
+    def test_sequence_parallel_training_sdpa(self, temp_dir):
+        """Smoke test for SDPA-based context parallelism."""
+        self._run_sequence_parallel_test(
+            temp_dir,
+            sample_packing=False,
+            micro_batch_size=1,
+            pad_to_sequence_len=True,
+            ring_attn_func=None,
+            threshold=3.0,
+            flash_attention=False,
+            sdp_attention=True,
+        )
--- a/tests/loaders/test_patch_manager_cp.py
+++ b/tests/loaders/test_patch_manager_cp.py
@@ -0,0 +1,74 @@
+"""Tests for PatchManager context parallel patch selection."""
+
+import addict
+
+from axolotl.loaders.patch_manager import PatchManager
+from axolotl.utils.dict import DictDefault
+
+
+def _stub_transformers_patches(monkeypatch):
+    """Replace trainer loss patchers with no-ops for isolation."""
+    monkeypatch.setattr(
+        "axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop",
+        lambda: None,
+    )
+    monkeypatch.setattr(
+        "axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate",
+        lambda: None,
+    )
+
+
+def test_patch_manager_applies_flash_cp_patch(monkeypatch):
+    """When flash attention is enabled, we patch Trainer for CP."""
+    _stub_transformers_patches(monkeypatch)
+
+    patch_calls = {"count": 0}
+
+    def stub_patch():
+        patch_calls["count"] += 1
+
+    monkeypatch.setattr(
+        "axolotl.monkeypatch.transformers.trainer_context_parallel.patch_prepare_context_parallel_inputs",
+        stub_patch,
+    )
+
+    cfg = DictDefault(
+        {
+            "context_parallel_size": 2,
+            "flash_attention": True,
+            "sdp_attention": False,
+        }
+    )
+
+    manager = PatchManager(cfg, addict.Dict())
+    manager._apply_transformers_patches()
+
+    assert patch_calls["count"] == 1
+
+
+def test_patch_manager_skips_flash_patch_for_sdpa(monkeypatch):
+    """When only SDPA is requested, we should not patch Trainer."""
+    _stub_transformers_patches(monkeypatch)
+
+    patch_calls = {"count": 0}
+
+    def stub_patch():
+        patch_calls["count"] += 1
+
+    monkeypatch.setattr(
+        "axolotl.monkeypatch.transformers.trainer_context_parallel.patch_prepare_context_parallel_inputs",
+        stub_patch,
+    )
+
+    cfg = DictDefault(
+        {
+            "context_parallel_size": 2,
+            "flash_attention": False,
+            "sdp_attention": True,
+        }
+    )
+
+    manager = PatchManager(cfg, addict.Dict())
+    manager._apply_transformers_patches()
+
+    assert patch_calls["count"] == 0
--- a/tests/monkeypatch/test_trainer_context_parallel_patch.py
+++ b/tests/monkeypatch/test_trainer_context_parallel_patch.py
@@ -0,0 +1,66 @@
+"""Tests for the HF Trainer context parallel patch."""
+
+import pytest
+from transformers import Trainer
+
+from axolotl.monkeypatch.transformers.trainer_context_parallel import (
+    GUARD_PATTERN,
+    PATCHED_GUARD,
+    patch_prepare_context_parallel_inputs,
+)
+
+
+@pytest.fixture
+def restore_trainer_prepare_method():
+    """Ensure Trainer._prepare_context_parallel_inputs is restored after a test."""
+    original_method = getattr(
+        Trainer,
+        "_original_prepare_context_parallel_inputs",
+        Trainer._prepare_context_parallel_inputs,
+    )
+    patched_attr_present = hasattr(
+        Trainer, "_axolotl_prepare_context_parallel_inputs_patched"
+    )
+
+    yield
+
+    Trainer._prepare_context_parallel_inputs = original_method
+    if patched_attr_present:
+        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
+    if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
+        delattr(Trainer, "_original_prepare_context_parallel_inputs")
+    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"):
+        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source")
+
+
+def test_patch_attention_guard(restore_trainer_prepare_method):
+    """Patch should swap the guard to allow sdpa or flash attention."""
+    # Ensure we start from the unpatched method
+    if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
+        Trainer._prepare_context_parallel_inputs = (
+            Trainer._original_prepare_context_parallel_inputs
+        )
+        delattr(Trainer, "_original_prepare_context_parallel_inputs")
+    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"):
+        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
+
+    patch_prepare_context_parallel_inputs()
+
+    patched_method = Trainer._prepare_context_parallel_inputs
+    assert patched_method is not None
+    assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False)
+
+    source = Trainer._axolotl_prepare_context_parallel_inputs_source
+    assert GUARD_PATTERN not in source
+    assert PATCHED_GUARD in source
+
+
+def test_patch_is_idempotent(restore_trainer_prepare_method):
+    """Calling the patch twice should leave the same patched function in place."""
+    patch_prepare_context_parallel_inputs()
+    first_patched = Trainer._prepare_context_parallel_inputs
+
+    patch_prepare_context_parallel_inputs()
+    second_patched = Trainer._prepare_context_parallel_inputs
+
+    assert first_patched is second_patched
--- a/tests/test_train_context_parallel.py
+++ b/tests/test_train_context_parallel.py
@@ -0,0 +1,111 @@
+"""Unit tests for choosing the correct context parallel implementation."""
+
+from types import SimpleNamespace
+
+from axolotl.train import execute_training
+from axolotl.utils.dict import DictDefault
+
+
+class DummyTrainer:
+    """Minimal trainer stub to exercise execute_training."""
+
+    def __init__(self):
+        self.model = object()
+        self.ref_model = None
+        self.accelerator = SimpleNamespace(torch_device_mesh=None)
+        self.train_called = False
+
+    def train(self, resume_from_checkpoint=None):  # pylint: disable=unused-argument
+        self.train_called = True
+
+
+class DummyPluginManager:
+    """Minimal plugin manager stub."""
+
+    @staticmethod
+    def post_train(cfg, model):  # pylint: disable=unused-argument
+        return None
+
+
+class DummyContext:
+    """Test context manager that records entries/exits."""
+
+    def __init__(self, recorder, **kwargs):
+        recorder.append({"kwargs": kwargs})
+        self.recorder = recorder
+
+    def __enter__(self):
+        self.recorder[-1]["entered"] = True
+        return self
+
+    def __exit__(self, exc_type, exc, tb):  # pylint: disable=unused-argument
+        self.recorder[-1]["exited"] = True
+        return False
+
+
+def _base_cfg(**overrides):
+    base = {
+        "context_parallel_size": 2,
+        "gradient_accumulation_steps": 1,
+        "ring_attn_func": None,
+        "heads_k_stride": None,
+        "rl": None,
+        "flash_optimum": False,
+    }
+    base.update(overrides)
+    return DictDefault(base)
+
+
+def test_execute_training_uses_ring_when_flash(monkeypatch):
+    """FlashAttention CP should engage the custom ring context manager."""
+    recorder: list[dict] = []
+
+    monkeypatch.setattr(
+        "axolotl.train.SequenceParallelContextManager",
+        lambda **kwargs: DummyContext(recorder, **kwargs),
+    )
+    monkeypatch.setattr(
+        "axolotl.train.PluginManager.get_instance",
+        lambda: DummyPluginManager(),
+    )
+
+    cfg = _base_cfg(flash_attention=True, sdp_attention=False)
+    trainer = DummyTrainer()
+
+    execute_training(cfg, trainer, resume_from_checkpoint=None)
+
+    assert trainer.train_called
+    assert len(recorder) == 1
+    assert recorder[0]["kwargs"]["context_parallel_size"] == 2
+    assert recorder[0].get("entered") is True
+    assert recorder[0].get("exited") is True
+
+
+def test_execute_training_uses_transformers_cp_for_sdpa(monkeypatch):
+    """SDPA CP should bypass the ring context manager."""
+    invoked = {"count": 0}
+
+    class NoOpContext:
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):  # pylint: disable=unused-argument
+            return False
+
+    monkeypatch.setattr(
+        "axolotl.train.SequenceParallelContextManager",
+        lambda **kwargs: invoked.__setitem__("count", invoked["count"] + 1)
+        or NoOpContext(),
+    )
+    monkeypatch.setattr(
+        "axolotl.train.PluginManager.get_instance",
+        lambda: DummyPluginManager(),
+    )
+
+    cfg = _base_cfg(flash_attention=False, sdp_attention=True)
+    trainer = DummyTrainer()
+
+    execute_training(cfg, trainer, resume_from_checkpoint=None)
+
+    assert trainer.train_called
+    assert invoked["count"] == 0
Author	SHA1	Message	Date
Dan Saunders	09725be990	add support for CP + torch SDPA	2025-09-25 12:03:43 -04:00
Dan Saunders	f9bd6936c1	Merge branch 'main' into cp-fix	2025-09-24 14:01:23 -04:00
Dan Saunders	b9a3bfee5a	only patch in CP > 1 case	2025-09-24 13:36:14 -04:00
Dan Saunders	08124a7c92	nits	2025-09-24 13:25:46 -04:00
Dan Saunders	56e0a77e0d	patch transformers to allow CP + FA2	2025-09-24 13:08:38 -04:00