better handling since xgen tokenizer breaks with convert_tokens_to_ids

This commit is contained in:
Wing Lian
2023-07-21 09:24:11 -04:00
parent 06c61d6f13
commit 2a428e8014

View File

@@ -48,16 +48,22 @@ class PromptTokenizingStrategy(abc.ABC):
@functools.lru_cache(maxsize=128)
def _get_user_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
try:
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
except KeyError:
pass
return False
@functools.lru_cache(maxsize=128)
def _get_assistant_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
try:
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
except KeyError:
pass
return False
def _tokenize(self, prompt: str, add_eos_token=True, strip_bos_token=False):