make sure to patch all the loaded models

This commit is contained in:
Wing Lian
2025-04-06 14:45:30 -04:00
parent 7e410ab480
commit 1a5d445413


@@ -1,5 +1,6 @@
 """Flex attention monkey patch"""
+import sys
 from typing import Optional, Tuple, Union
 import torch
@@ -52,9 +53,9 @@ def patch_flex_wrapper():
 def patch_flex_make_mask():
     is_torch_2_6 = torch.__version__.startswith("2.6")
-    is_transformers_below_4_51 = transformers.__version__ < "4.51.0"
+    is_transformers_eq_4_51 = transformers.__version__ == "4.51.0"
-    if not (is_torch_2_6 and is_transformers_below_4_51):
+    if not (is_torch_2_6 and is_transformers_eq_4_51):
         return
     from torch.nn.attention.flex_attention import (
@@ -66,7 +67,7 @@ def patch_flex_make_mask():
     Offset = Union[torch.Tensor, int]
-    def make_flex_block_causal_mask(
+    def patched_make_flex_block_causal_mask(
         attention_mask_2d: torch.Tensor,
         attention_chunk_size: Optional[int] = None,
         query_length=None,
@@ -157,6 +158,14 @@ def patch_flex_make_mask():
         _compile=True,
     )
+    for n in tuple(sys.modules):
+        if ".modeling_" in n and "llama4" not in n:
+            if hasattr(sys.modules[n], "make_flex_block_causal_mask"):
+                print(n)
+                sys.modules[n].make_flex_block_causal_mask = (
+                    patched_make_flex_block_causal_mask
+                )
     transformers.integrations.flex_attention.make_flex_block_causal_mask = (
-        make_flex_block_causal_mask
+        patched_make_flex_block_causal_mask
     )
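
The loop over sys.modules is the key change: a modeling module that ran a from-import of make_flex_block_causal_mask holds its own reference, so replacing the attribute only on transformers.integrations.flex_attention leaves those already-loaded modules calling the old function. Below is a minimal, self-contained sketch of that stale-reference behavior using made-up module names (source, consumer.modeling_demo); it is an illustration of the pattern, not code from the repository.

import sys
import types

# Stand-in for transformers.integrations.flex_attention
source = types.ModuleType("source")
source.make_flex_block_causal_mask = lambda mask: "original"
sys.modules["source"] = source

# Stand-in for a loaded modeling_* module that did
# `from source import make_flex_block_causal_mask` at import time.
consumer = types.ModuleType("consumer.modeling_demo")
consumer.make_flex_block_causal_mask = source.make_flex_block_causal_mask
sys.modules["consumer.modeling_demo"] = consumer

def patched_make_flex_block_causal_mask(mask):
    return "patched"

# Patching only the source module does not update the consumer's binding.
source.make_flex_block_causal_mask = patched_make_flex_block_causal_mask
print(consumer.make_flex_block_causal_mask(None))  # -> "original"

# As in the commit: walk the already-loaded modules and rebind the name
# everywhere it was imported.
for name in tuple(sys.modules):
    mod = sys.modules[name]
    if ".modeling_" in name and hasattr(mod, "make_flex_block_causal_mask"):
        mod.make_flex_block_causal_mask = patched_make_flex_block_causal_mask

print(consumer.make_flex_block_causal_mask(None))  # -> "patched"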