most model types now support flash attention 2 regardless of multipack support (#1854)

Author: Wing Lian
Date: 2024-08-22 16:39:23 -04:00
Committed by: GitHub
Commit: fefa95e350 (parent b33dc07a77)
2 changed files with 5 additions and 10 deletions


@@ -17,6 +17,7 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "qwen2_moe",
     "falcon",
     "phi",
+    "phi3",
     "gemma",
     "gemma2",
     "gemmoe",

@@ -591,16 +591,10 @@ def load_model(
                 "flash_attention_2"
             )
         else:
-            if model_config.model_type in SUPPORTED_MULTIPACK_MODEL_TYPES:
-                model_kwargs["attn_implementation"] = "flash_attention_2"
-                model_config._attn_implementation = (  # pylint: disable=protected-access
-                    "flash_attention_2"
-                )
-            else:
-                model_kwargs["attn_implementation"] = "eager"
-                model_config._attn_implementation = (  # pylint: disable=protected-access
-                    "eager"
-                )
+            model_kwargs["attn_implementation"] = "flash_attention_2"
+            model_config._attn_implementation = (  # pylint: disable=protected-access
+                "flash_attention_2"
+            )
     elif cfg.sdp_attention:
         model_kwargs["attn_implementation"] = "sdpa"
         model_config._attn_implementation = "sdpa"  # pylint: disable=protected-access
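After this change, requesting flash attention selects flash_attention_2 for every model type rather than only those listed in SUPPORTED_MULTIPACK_MODEL_TYPES, and the eager fallback for unlisted types is removed. The sketch below condenses that post-commit selection logic into a standalone function; the Cfg dataclass and the name pick_attn_implementation are assumptions for illustration, not axolotl's actual API.

# Condensed, illustrative sketch of the post-commit behavior.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Cfg:
    flash_attention: bool = False
    sdp_attention: bool = False

def pick_attn_implementation(cfg: Cfg) -> Optional[str]:
    if cfg.flash_attention:
        # No SUPPORTED_MULTIPACK_MODEL_TYPES check anymore: every model type
        # gets flash_attention_2 when flash attention is requested.
        return "flash_attention_2"
    if cfg.sdp_attention:
        return "sdpa"
    return None  # leave the choice to the transformers default

print(pick_attn_implementation(Cfg(flash_attention=True)))  # flash_attention_2
print(pick_attn_implementation(Cfg(sdp_attention=True)))    # sdpa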