fix: train_on_inputs: true ignored for sharegpt (#1045) [skip ci]
* fix: `train_on_inputs: true` ignored for sharegpt
* enable unit test for train_on_inputs for sharegpt

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
@@ -379,10 +379,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
|||||||
add_eos_token=False,
|
add_eos_token=False,
|
||||||
strip_bos_token=True,
|
strip_bos_token=True,
|
||||||
)
|
)
|
||||||
# everything from this is masked out from the labels
|
if self.train_on_inputs:
|
||||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
labels = copy.deepcopy(res["input_ids"])
|
||||||
|
else:
|
||||||
|
# everything from this is masked out from the labels
|
||||||
|
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||||
elif assistant in role:
|
elif assistant in role:
|
||||||
# TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
|
|
||||||
role = (
|
role = (
|
||||||
role.replace(role_remap[1]["from"], role_remap[1]["to"])
|
role.replace(role_remap[1]["from"], role_remap[1]["to"])
|
||||||
if role_remap
|
if role_remap
|
||||||
@@ -406,18 +408,24 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
|||||||
add_eos_token=False,
|
add_eos_token=False,
|
||||||
strip_bos_token=True,
|
strip_bos_token=True,
|
||||||
)
|
)
|
||||||
# not masked out from labels
|
|
||||||
labels = copy.deepcopy(res["input_ids"])
|
labels = copy.deepcopy(res["input_ids"])
|
||||||
len_role = len(role_res["input_ids"])
|
if not self.train_on_inputs:
|
||||||
labels[:len_role] = [IGNORE_TOKEN_ID] * min(len_role, len(labels))
|
# mask out role tokens from the labels
|
||||||
|
len_role = len(role_res["input_ids"])
|
||||||
|
labels[:len_role] = [IGNORE_TOKEN_ID] * min(
|
||||||
|
len_role, len(labels)
|
||||||
|
)
|
||||||
elif role == "":
|
elif role == "":
|
||||||
turn = content
|
turn = content
|
||||||
# this is only ever the first part, should include the bos token and the user query
|
# this is only ever the first part, should include the bos token and the user query
|
||||||
res = self._tokenize(
|
res = self._tokenize(
|
||||||
turn, add_eos_token=False, strip_bos_token=False
|
turn, add_eos_token=False, strip_bos_token=False
|
||||||
)
|
)
|
||||||
# everything from this is masked out from the labels
|
if self.train_on_inputs:
|
||||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
labels = copy.deepcopy(res["input_ids"])
|
||||||
|
else:
|
||||||
|
# everything from this is masked out from the labels
|
||||||
|
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||||
else:
|
else:
|
||||||
LOG.warning(f"unhandled role: {role}")
|
LOG.warning(f"unhandled role: {role}")
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ class TestSharegpt:
|
|||||||
role_key_human=None,
|
role_key_human=None,
|
||||||
),
|
),
|
||||||
tokenizer,
|
tokenizer,
|
||||||
True, # train_on_inputs
|
False, # train_on_inputs
|
||||||
2048, # sequence_len
|
2048, # sequence_len
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -124,30 +124,30 @@ class TestSharegpt:
|
|||||||
]
|
]
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
# def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
|
def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
|
||||||
# strategy = SimpleShareGPTPromptTokenizingStrategy(
|
strategy = SimpleShareGPTPromptTokenizingStrategy(
|
||||||
# ShareGPTPrompterV2(
|
ShareGPTPrompterV2(
|
||||||
# conversation="chatml",
|
conversation="chatml",
|
||||||
# role_key_model=None,
|
role_key_model=None,
|
||||||
# role_key_human=None,
|
role_key_human=None,
|
||||||
# ),
|
),
|
||||||
# tokenizer,
|
tokenizer,
|
||||||
# False, # train_on_inputs
|
True, # train_on_inputs
|
||||||
# 2048, # sequence_len
|
2048, # sequence_len
|
||||||
# )
|
)
|
||||||
#
|
|
||||||
# dataset_wrapper = TokenizedPromptDataset(
|
dataset_wrapper = TokenizedPromptDataset(
|
||||||
# strategy, sharegpt_dataset, process_count=1
|
strategy, sharegpt_dataset, process_count=1
|
||||||
# )
|
)
|
||||||
#
|
|
||||||
# labels = dataset_wrapper[0]["labels"]
|
labels = dataset_wrapper[0]["labels"]
|
||||||
# # fmt: off
|
# fmt: off
|
||||||
# assert labels == [
|
assert labels == [
|
||||||
# 1, # bos
|
1, # bos
|
||||||
# 32001, 1587, 13, 25997, 32000, 28705, 13, # system
|
32001, 1587, 13, 25997, 32000, 28705, 13, # system
|
||||||
# 32001, 2188, 13, 21558, 32000, 28705, 13, # human
|
32001, 2188, 13, 21558, 32000, 28705, 13, # human
|
||||||
# 32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
|
32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
|
||||||
# 32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
|
32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
|
||||||
# 32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
|
32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
|
||||||
# ]
|
]
|
||||||
# # fmt: on
|
# fmt: on
|
||||||
|
|||||||
Reference in New Issue
Block a user