fix: train_on_inputs: true ignored for sharegpt (#1045) [skip ci]

* fix: `train_on_inputs: true` ignored for sharegpt

* enable unit test for train_on_inputs for sharegpt

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
NanoCode012
2024-01-10 13:00:09 +09:00
committed by GitHub
parent 0f100800e3
commit 043c3860cd
2 changed files with 44 additions and 36 deletions

View File

@@ -379,10 +379,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
         add_eos_token=False,
         strip_bos_token=True,
     )
-    # everything from this is masked out from the labels
-    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+    if self.train_on_inputs:
+        labels = copy.deepcopy(res["input_ids"])
+    else:
+        # everything from this is masked out from the labels
+        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
 elif assistant in role:
-    # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
     role = (
         role.replace(role_remap[1]["from"], role_remap[1]["to"])
         if role_remap
@@ -406,18 +408,24 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
         add_eos_token=False,
         strip_bos_token=True,
     )
-    # not masked out from labels
     labels = copy.deepcopy(res["input_ids"])
-    len_role = len(role_res["input_ids"])
-    labels[:len_role] = [IGNORE_TOKEN_ID] * min(len_role, len(labels))
+    if not self.train_on_inputs:
+        # mask out role tokens from the labels
+        len_role = len(role_res["input_ids"])
+        labels[:len_role] = [IGNORE_TOKEN_ID] * min(
+            len_role, len(labels)
+        )
 elif role == "":
     turn = content
     # this is only ever the first part, should include the bos token and the user query
     res = self._tokenize(
         turn, add_eos_token=False, strip_bos_token=False
     )
-    # everything from this is masked out from the labels
-    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+    if self.train_on_inputs:
+        labels = copy.deepcopy(res["input_ids"])
+    else:
+        # everything from this is masked out from the labels
+        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
 else:
     LOG.warning(f"unhandled role: {role}")
     continue

View File

@@ -104,7 +104,7 @@ class TestSharegpt:
         role_key_human=None,
     ),
     tokenizer,
-    True, # train_on_inputs
+    False, # train_on_inputs
     2048, # sequence_len
 )
@@ -124,30 +124,30 @@ class TestSharegpt:
     ]
     # fmt: on
-# def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
-# strategy = SimpleShareGPTPromptTokenizingStrategy(
-# ShareGPTPrompterV2(
-# conversation="chatml",
-# role_key_model=None,
-# role_key_human=None,
-# ),
-# tokenizer,
-# False, # train_on_inputs
-# 2048, # sequence_len
-# )
-#
-# dataset_wrapper = TokenizedPromptDataset(
-# strategy, sharegpt_dataset, process_count=1
-# )
-#
-# labels = dataset_wrapper[0]["labels"]
-# # fmt: off
-# assert labels == [
-# 1, # bos
-# 32001, 1587, 13, 25997, 32000, 28705, 13, # system
-# 32001, 2188, 13, 21558, 32000, 28705, 13, # human
-# 32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
-# 32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
-# 32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
-# ]
-# # fmt: on
+def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
+    strategy = SimpleShareGPTPromptTokenizingStrategy(
+        ShareGPTPrompterV2(
+            conversation="chatml",
+            role_key_model=None,
+            role_key_human=None,
+        ),
+        tokenizer,
+        True, # train_on_inputs
+        2048, # sequence_len
+    )
+
+    dataset_wrapper = TokenizedPromptDataset(
+        strategy, sharegpt_dataset, process_count=1
+    )
+
+    labels = dataset_wrapper[0]["labels"]
+    # fmt: off
+    assert labels == [
+        1, # bos
+        32001, 1587, 13, 25997, 32000, 28705, 13, # system
+        32001, 2188, 13, 21558, 32000, 28705, 13, # human
+        32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
+        32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
+        32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
+    ]
+    # fmt: on