add support for CP + torch SDPA
This commit is contained in:
@@ -84,7 +84,9 @@ class PatchManager:
|
||||
patch_evaluation_loop()
|
||||
patch_maybe_log_save_evaluate()
|
||||
|
||||
if self.cfg.context_parallel_size > 1:
|
||||
if self.cfg.context_parallel_size > 1 and getattr(
|
||||
self.cfg, "flash_attention", False
|
||||
):
|
||||
from axolotl.monkeypatch.transformers.trainer_context_parallel import (
|
||||
patch_prepare_context_parallel_inputs,
|
||||
)
|
||||
|
||||
@@ -13,21 +13,10 @@ from typing import Callable
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import transformers
|
||||
import transformers.modeling_flash_attention_utils
|
||||
import transformers.modeling_flash_attention_utils as flash_utils
|
||||
from ring_flash_attn import ring_flash_attn_func
|
||||
from ring_flash_attn.adapters.hf_adapter import check_params
|
||||
from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal
|
||||
|
||||
try:
|
||||
from transformers.modeling_flash_attention_utils import _flash_supports_window
|
||||
except ImportError:
|
||||
try:
|
||||
from transformers.modeling_flash_attention_utils import (
|
||||
_flash_supports_window_size as _flash_supports_window,
|
||||
)
|
||||
except ImportError:
|
||||
_flash_supports_window = True
|
||||
|
||||
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
|
||||
|
||||
from axolotl.utils.schemas.enums import RingAttnFunc
|
||||
@@ -118,7 +107,7 @@ def create_flash_attn_forward_varlen_llama3(
|
||||
|
||||
# Handle sliding window
|
||||
use_sliding_windows = (
|
||||
_flash_supports_window
|
||||
_flash_windows_supported()
|
||||
and sliding_window is not None
|
||||
and key_states.shape[1] > sliding_window
|
||||
)
|
||||
@@ -194,3 +183,18 @@ def substitute_hf_flash_attn(
|
||||
from ring_flash_attn.adapters.hf_adapter import flash_attention_forward
|
||||
|
||||
ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
|
||||
|
||||
|
||||
def _flash_windows_supported() -> bool:
    """Report whether ``flash_utils`` advertises sliding-window support.

    The flag is looked up under ``_flash_supports_window`` first and then
    under ``_flash_supports_window_size``. When neither attribute is present
    (or both are ``None``) support is assumed.

    Returns:
        ``True`` when no flag is found or the flag is callable; otherwise the
        truthiness of the flag that was found.
    """
    for attr_name in ("_flash_supports_window", "_flash_supports_window_size"):
        flag = getattr(flash_utils, attr_name, None)
        if flag is not None:
            break
    else:
        # Neither attribute is exposed: default to assuming support.
        return True

    # A callable flag is never invoked here; its presence counts as support.
    return callable(flag) or bool(flag)
|
||||
|
||||
@@ -13,18 +13,9 @@ from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import transformers.modeling_flash_attention_utils as flash_utils
|
||||
from torch.distributed import DeviceMesh
|
||||
|
||||
try:
|
||||
from transformers.modeling_flash_attention_utils import _flash_supports_window
|
||||
except ImportError:
|
||||
try:
|
||||
from transformers.modeling_flash_attention_utils import (
|
||||
_flash_supports_window_size as _flash_supports_window,
|
||||
)
|
||||
except ImportError:
|
||||
_flash_supports_window = True
|
||||
|
||||
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
||||
from axolotl.utils.logging import get_logger
|
||||
from axolotl.utils.schemas.enums import RingAttnFunc
|
||||
@@ -83,7 +74,7 @@ def create_ring_flash_attention_forward(
|
||||
|
||||
# Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
|
||||
use_sliding_windows = (
|
||||
_flash_supports_window
|
||||
_flash_windows_supported()
|
||||
and sliding_window is not None
|
||||
and key_states.shape[1] > sliding_window
|
||||
)
|
||||
@@ -225,3 +216,19 @@ def update_ring_attn_params(position_ids: torch.Tensor | None):
|
||||
cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
|
||||
cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
|
||||
update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())
|
||||
|
||||
|
||||
def _flash_windows_supported() -> bool:
    """Best-effort probe for FlashAttention sliding-window support.

    Reads ``_flash_supports_window`` from ``flash_utils``, falling back to
    ``_flash_supports_window_size`` when the first is missing or ``None``.

    Returns:
        ``True`` if neither attribute yields a value or the value is
        callable; otherwise ``bool`` of the value found.
    """
    attr_names = ("_flash_supports_window", "_flash_supports_window_size")
    # Lazily take the first non-None attribute, in priority order.
    found = next(
        (
            value
            for value in (getattr(flash_utils, name, None) for name in attr_names)
            if value is not None
        ),
        None,
    )

    # Signature differs across versions; assume support when callable.
    # A missing flag is likewise treated as supported.
    if found is None or callable(found):
        return True

    return bool(found)
|
||||
|
||||
@@ -179,7 +179,11 @@ def execute_training(
|
||||
)
|
||||
)
|
||||
|
||||
if cfg.context_parallel_size > 1:
|
||||
use_flash_cp = cfg.context_parallel_size > 1 and bool(
|
||||
getattr(cfg, "flash_attention", False)
|
||||
)
|
||||
|
||||
if use_flash_cp:
|
||||
models = [trainer.model]
|
||||
if hasattr(trainer, "ref_model") and trainer.ref_model:
|
||||
models.append(trainer.ref_model)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
"""Module with validation methods for config pydantic model."""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
@@ -1314,50 +1313,40 @@ class ComplexValidationMixin:
|
||||
if not self.context_parallel_size:
|
||||
self.context_parallel_size = 1
|
||||
elif self.context_parallel_size > 1:
|
||||
if not self.flash_attention:
|
||||
use_flash_attention = getattr(self, "flash_attention", False)
|
||||
use_sdp_attention = getattr(self, "sdp_attention", False)
|
||||
|
||||
if not (use_flash_attention or use_sdp_attention):
|
||||
raise ValueError(
|
||||
"flash_attention: true must be set with context_parallel_size > 1"
|
||||
"context_parallel_size > 1 requires either flash_attention: true "
|
||||
"or sdp_attention: true"
|
||||
)
|
||||
|
||||
if self.sample_packing and self.micro_batch_size > 1:
|
||||
raise ValueError(
|
||||
"micro_batch_size must be set to 1 when sample_packing is enabled "
|
||||
"due to a `ring-flash-attn` requirement"
|
||||
if use_flash_attention:
|
||||
if self.sample_packing and self.micro_batch_size > 1:
|
||||
raise ValueError(
|
||||
"micro_batch_size must be set to 1 when sample_packing is enabled "
|
||||
"due to a `ring-flash-attn` requirement"
|
||||
)
|
||||
|
||||
try:
|
||||
import ring_flash_attn # noqa: F401 # Required after monkey-patching
|
||||
except ImportError as exception:
|
||||
raise ImportError(
|
||||
"context_parallel_size > 1 but ring_flash_attn is not installed. "
|
||||
"Please install it with `pip install axolotl[ring-flash-attn] "
|
||||
"or `pip install ring-flash-attn>=0.1.4`."
|
||||
) from exception
|
||||
|
||||
LOG.warning(
|
||||
"Sequence parallelism (SP) is enabled with "
|
||||
f"context_parallel_size={self.context_parallel_size}. "
|
||||
"Please note that logged losses may differ slightly to the non-SP "
|
||||
"losses due to transformers Trainer implementation details. "
|
||||
"Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
|
||||
"for more details."
|
||||
)
|
||||
|
||||
try:
|
||||
import transformers.modeling_flash_attention_utils
|
||||
from transformers.utils import is_flash_attn_greater_or_equal
|
||||
|
||||
transformers.modeling_flash_attention_utils._flash_supports_window = (
|
||||
True
|
||||
)
|
||||
sys.modules[
|
||||
"transformers.modeling_flash_attention_utils"
|
||||
]._flash_supports_window = True
|
||||
sys.modules[
|
||||
"transformers.modeling_flash_attention_utils"
|
||||
]._flash_supports_window_size = True
|
||||
sys.modules[
|
||||
"transformers.modeling_flash_attention_utils"
|
||||
].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
|
||||
import ring_flash_attn # noqa: F401 # Required after monkey-patching
|
||||
except ImportError as exception:
|
||||
raise ImportError(
|
||||
"context_parallel_size > 1 but ring_flash_attn is not installed. "
|
||||
"Please install it with `pip install axolotl[ring-flash-attn] "
|
||||
"or `pip install ring-flash-attn>=0.1.4`."
|
||||
) from exception
|
||||
|
||||
LOG.warning(
|
||||
"Sequence parallelism (SP) is enabled with "
|
||||
f"context_parallel_size={self.context_parallel_size}. "
|
||||
"Please note that logged losses may differ slightly to the non-SP "
|
||||
"losses due to transformers Trainer implementation details. "
|
||||
"Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
|
||||
"for more details."
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
@model_validator(mode="after")
|
||||
|
||||
Reference in New Issue
Block a user