misc sharegpt fixes (#723)

* support for sharegpt with assistant talking first, better masking of assistant token, allow remap of roles from dataset

* invalid role is actually not possible

* update tokenized fixture for corrected labels
This commit is contained in:
Wing Lian
2023-10-13 11:04:39 -04:00
committed by GitHub
parent bfbdba8614
commit f30afe4544
4 changed files with 107 additions and 36 deletions

View File

@@ -274,9 +274,11 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods
raise err
conv.messages = []
for j, sentence in enumerate(source):
for _, sentence in enumerate(source):
role = roles[sentence["from"]]
if role != conv.roles[j % 2]:
if len(conv.messages) > 0 and (
(role == conv.messages[-1][0]) or (role not in conv.roles)
):
LOG.warning(f"{SHAREGPT_ASSERTION_FAILED_ROLE}: {sentence}")
conv.append_message(role, sentence["value"])