Fix llama check: detect llama-derived models from base_model or cfg.model_type, and add logging

This commit is contained in:
Wing Lian
2023-04-18 01:19:53 -04:00
parent 3f3f561c06
commit eb808903e5

View File

@@ -60,12 +60,14 @@ def load_model(base_model, base_model_config, model_type, tokenizer_type, cfg, a
# TODO refactor as a kwarg # TODO refactor as a kwarg
load_in_8bit = cfg.load_in_8bit load_in_8bit = cfg.load_in_8bit
tokenizer = None tokenizer = None
is_llama_derived_model = "llama" in base_model or "llama" in cfg.model_type.lower()
if adapter != "lora": if adapter != "lora":
raise NotImplementedError(f"{adapter} peft adapter not available") raise NotImplementedError(f"{adapter} peft adapter not available")
if "llama" in base_model and cfg.flash_attention: if is_llama_derived_model and cfg.flash_attention:
if cfg.device not in ["mps", "cpu"] and inference is False: if cfg.device not in ["mps", "cpu"] and inference is False:
from axolotl.flash_attn import replace_llama_attn_with_flash_attn from axolotl.flash_attn import replace_llama_attn_with_flash_attn
logging.info("patching with flash attention")
replace_llama_attn_with_flash_attn() replace_llama_attn_with_flash_attn()
torch_dtype = torch.float16 if cfg.load_in_8bit or cfg.fp16 else torch.float32, torch_dtype = torch.float16 if cfg.load_in_8bit or cfg.fp16 else torch.float32,
@@ -85,7 +87,7 @@ def load_model(base_model, base_model_config, model_type, tokenizer_type, cfg, a
raise e raise e
try: try:
if cfg.load_4bit and ("llama" in base_model or "llama" in cfg.model_type.lower()): if cfg.load_4bit and is_llama_derived_model:
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
@@ -104,7 +106,7 @@ def load_model(base_model, base_model_config, model_type, tokenizer_type, cfg, a
is_v1_model=cfg.gptq_model_v1 if cfg.gptq_model_v1 is not None else True, is_v1_model=cfg.gptq_model_v1 if cfg.gptq_model_v1 is not None else True,
) )
load_in_8bit = False load_in_8bit = False
elif "llama" in base_model: elif is_llama_derived_model:
model = LlamaForCausalLM.from_pretrained( model = LlamaForCausalLM.from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit, load_in_8bit=cfg.load_in_8bit,
@@ -128,13 +130,18 @@ def load_model(base_model, base_model_config, model_type, tokenizer_type, cfg, a
if not tokenizer: if not tokenizer:
try: try:
if "llama" in base_model: if is_llama_derived_model:
tokenizer = LlamaTokenizer.from_pretrained(model) tokenizer = LlamaTokenizer.from_pretrained(model)
else: else:
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model) tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model)
except: except:
tokenizer = AutoTokenizer.from_pretrained(base_model) tokenizer = AutoTokenizer.from_pretrained(base_model)
logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
logging.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
logging.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
if tokenizer.__class__.__name__ in ["LlamaTokenizer", "LlamaTokenizerFast"]: if tokenizer.__class__.__name__ in ["LlamaTokenizer", "LlamaTokenizerFast"]:
tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
@@ -144,6 +151,7 @@ def load_model(base_model, base_model_config, model_type, tokenizer_type, cfg, a
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
if load_in_8bit and not cfg.load_4bit: if load_in_8bit and not cfg.load_4bit:
logging.info("converting model w/ prepare_model_for_int8_training")
model = prepare_model_for_int8_training(model) model = prepare_model_for_int8_training(model)
lora_config = LoraConfig( lora_config = LoraConfig(