Flash attn hotfix (#951)
* use previous arg * use eager to use legacy attention that can be patched
This commit is contained in:
@@ -324,6 +324,10 @@ def load_model(
|
|||||||
model_config._attn_implementation = ( # pylint: disable=protected-access
|
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||||
"flash_attention_2"
|
"flash_attention_2"
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||||
|
"eager"
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
|
if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
|
||||||
|
|||||||
Reference in New Issue
Block a user