fix: train_on_inputs: true ignored for sharegpt (#1045) [skip ci]

* fix: `train_on_inputs: true` ignored for sharegpt

* enable unit test for train_on_inputs for sharegpt

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
This commit is contained in:
NanoCode012
2024-01-10 13:00:09 +09:00
committed by GitHub
parent 0f100800e3
commit 043c3860cd
2 changed files with 44 additions and 36 deletions

View File

@@ -104,7 +104,7 @@ class TestSharegpt:
role_key_human=None,
),
tokenizer,
True, # train_on_inputs
False, # train_on_inputs
2048, # sequence_len
)
@@ -124,30 +124,30 @@ class TestSharegpt:
]
# fmt: on
# def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
# strategy = SimpleShareGPTPromptTokenizingStrategy(
# ShareGPTPrompterV2(
# conversation="chatml",
# role_key_model=None,
# role_key_human=None,
# ),
# tokenizer,
# False, # train_on_inputs
# 2048, # sequence_len
# )
#
# dataset_wrapper = TokenizedPromptDataset(
# strategy, sharegpt_dataset, process_count=1
# )
#
# labels = dataset_wrapper[0]["labels"]
# # fmt: off
# assert labels == [
# 1, # bos
# 32001, 1587, 13, 25997, 32000, 28705, 13, # system
# 32001, 2188, 13, 21558, 32000, 28705, 13, # human
# 32001, 13892, 13, 21558, 32000, 28705, 13, # gpt
# 32001, 2188, 13, 12684, 17664, 32000, 28705, 13, # human
# 32001, 13892, 13, 12684, 17664, 32000, 28705, 13, # gpt
# ]
# # fmt: on
def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
    """Check the labels of the first tokenized sharegpt sample (chatml).

    NOTE(review): despite the test's name, the strategy below is built
    with the positional flag annotated ``train_on_inputs`` set to ``True``.
    The expected labels therefore list concrete token ids for every turn
    (system, human and gpt) -- confirm the name/flag pairing is intended.
    """
    prompter = ShareGPTPrompterV2(
        conversation="chatml",
        role_key_model=None,
        role_key_human=None,
    )
    strategy = SimpleShareGPTPromptTokenizingStrategy(
        prompter,
        tokenizer,
        True,  # train_on_inputs
        2048,  # sequence_len
    )
    tokenized = TokenizedPromptDataset(
        strategy, sharegpt_dataset, process_count=1
    )
    labels = tokenized[0]["labels"]

    # One row per conversation turn; kept hand-aligned, hence fmt: off.
    # fmt: off
    expected_labels = [
        1,  # bos
        32001, 1587, 13, 25997, 32000, 28705, 13,  # system
        32001, 2188, 13, 21558, 32000, 28705, 13,  # human
        32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
        32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
        32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
    ]
    # fmt: on
    assert labels == expected_labels