Mistral flash attn packing (#646)

* add mistral monkeypatch

* add arg for decoder attention mask

* fix lint for duplicate code

* make sure to update transformers too

* tweak install for e2e

* move mistral patch to conditional
This commit is contained in:
Wing Lian
2023-09-27 18:41:00 -04:00
committed by GitHub
parent 85b0be2ba7
commit b6ab8aad62
4 changed files with 412 additions and 4 deletions

View File

@@ -150,6 +150,14 @@ def load_model(
# Note: This might overwrite previous additional_special_tokens
tokenizer.add_special_tokens({"additional_special_tokens": [MEM_TOKEN]})
if cfg.is_mistral_derived_model and cfg.flash_attention:
from axolotl.monkeypatch.mistral_attn_hijack_flash import (
replace_mistral_attn_with_flash_attn,
)
LOG.info("patching with flash attention")
replace_mistral_attn_with_flash_attn(packed=cfg.sample_packing)
if cfg.is_llama_derived_model and cfg.xpos_rope:
from axolotl.monkeypatch.xpos_rope_llama_monkey_patch import (
replace_llama_rope_with_xpos_rope,