fix double eos token for chatml (#1054) [skip ci]

* fix double eos token for chatml

* isolate fix to chatml conversation

* fix add special tokens to include rstrip

* add test for train_on_inputs for sharegpt

* don't use rstrip for chatml
This commit is contained in:
Wing Lian
2024-01-09 09:33:38 -05:00
committed by GitHub
parent 04b978b428
commit 651b7a31fc
2 changed files with 158 additions and 1 deletion

View File

@@ -392,9 +392,13 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
 # this should be the assistant response, should end with an eos token
 if not content.strip():
 LOG.warning(f"assistant turn has empty text: {prompt}")
+add_eos_token = not (
+conversation.name == "chatml"
+and conversation.sep == self.tokenizer.eos_token
+)
 res = self._tokenize(
 turn,
-add_eos_token=True,
+add_eos_token=add_eos_token,
 strip_bos_token=True,
 )
 role_res = self._tokenize(