fix: train_on_inputs: true ignored for sharegpt (#1045) [skip ci]

* fix: `train_on_inputs: true` ignored for sharegpt

* enable unit test for train_on_inputs for sharegpt

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
Author: NanoCode012
Date: 2024-01-10 13:00:09 +09:00
Committed by: GitHub
parent 0f100800e3
commit 043c3860cd
2 changed files with 44 additions and 36 deletions
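
For context, train_on_inputs is a top-level flag in axolotl's YAML config that controls whether prompt (user/system) tokens contribute to the training loss; the sharegpt strategy previously masked them regardless of the flag. A minimal config sketch (dataset path illustrative):

    datasets:
      - path: ./data/conversations.jsonl  # illustrative path
        type: sharegpt
    train_on_inputs: true  # previously ignored by the sharegpt strategy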


@@ -379,10 +379,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                     add_eos_token=False,
                     strip_bos_token=True,
                 )
-                # everything from this is masked out from the labels
-                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                if self.train_on_inputs:
+                    labels = copy.deepcopy(res["input_ids"])
+                else:
+                    # everything from this is masked out from the labels
+                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
             elif assistant in role:
                 # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
                 role = (
                     role.replace(role_remap[1]["from"], role_remap[1]["to"])
                     if role_remap
@@ -406,18 +408,24 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
                     add_eos_token=False,
                     strip_bos_token=True,
                 )
-                # not masked out from labels
                 labels = copy.deepcopy(res["input_ids"])
-                len_role = len(role_res["input_ids"])
-                labels[:len_role] = [IGNORE_TOKEN_ID] * min(len_role, len(labels))
+                if not self.train_on_inputs:
+                    # mask out role tokens from the labels
+                    len_role = len(role_res["input_ids"])
+                    labels[:len_role] = [IGNORE_TOKEN_ID] * min(
+                        len_role, len(labels)
+                    )
             elif role == "":
                 turn = content
                 # this is only ever the first part, should include the bos token and the user query
                 res = self._tokenize(
                     turn, add_eos_token=False, strip_bos_token=False
                 )
-                # everything from this is masked out from the labels
-                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
+                if self.train_on_inputs:
+                    labels = copy.deepcopy(res["input_ids"])
+                else:
+                    # everything from this is masked out from the labels
+                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
             else:
                 LOG.warning(f"unhandled role: {role}")
                 continue
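
The labeling rule the patched branches implement, as a minimal standalone sketch (IGNORE_TOKEN_ID = -100 per the usual HuggingFace convention; build_labels is an illustrative helper, not the project's API):

    import copy

    IGNORE_TOKEN_ID = -100  # positions set to -100 are skipped by HF loss functions

    def build_labels(input_ids, train_on_inputs, is_assistant_turn, len_role=0):
        # Assistant turns always contribute their content to the loss; only
        # the leading role tokens are masked unless train_on_inputs is set.
        if is_assistant_turn:
            labels = copy.deepcopy(input_ids)
            if not train_on_inputs:
                labels[:len_role] = [IGNORE_TOKEN_ID] * min(len_role, len(labels))
            return labels
        # User/system turns are fully masked unless train_on_inputs is set.
        if train_on_inputs:
            return copy.deepcopy(input_ids)
        return [IGNORE_TOKEN_ID] * len(input_ids)

    # A 5-token user turn stays masked by default...
    assert build_labels([5, 6, 7, 8, 9], False, False) == [IGNORE_TOKEN_ID] * 5
    # ...but enters the loss once train_on_inputs is enabled.
    assert build_labels([5, 6, 7, 8, 9], True, False) == [5, 6, 7, 8, 9]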


@@ -104,7 +104,7 @@ class TestSharegpt:
                 role_key_human=None,
             ),
             tokenizer,
-            True,  # train_on_inputs
+            False,  # train_on_inputs
             2048,  # sequence_len
         )
@@ -124,30 +124,30 @@ class TestSharegpt:
         ]
         # fmt: on

-    # def test_no_train_on_input(self, sharegpt_dataset, tokenizer):
-    #     strategy = SimpleShareGPTPromptTokenizingStrategy(
-    #         ShareGPTPrompterV2(
-    #             conversation="chatml",
-    #             role_key_model=None,
-    #             role_key_human=None,
-    #         ),
-    #         tokenizer,
-    #         False,  # train_on_inputs
-    #         2048,  # sequence_len
-    #     )
-    #
-    #     dataset_wrapper = TokenizedPromptDataset(
-    #         strategy, sharegpt_dataset, process_count=1
-    #     )
-    #
-    #     labels = dataset_wrapper[0]["labels"]
-    #     # fmt: off
-    #     assert labels == [
-    #         1,  # bos
-    #         32001, 1587, 13, 25997, 32000, 28705, 13,  # system
-    #         32001, 2188, 13, 21558, 32000, 28705, 13,  # human
-    #         32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
-    #         32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
-    #         32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
-    #     ]
-    #     # fmt: on
+    def test_w_train_on_input(self, sharegpt_dataset, tokenizer):
+        strategy = SimpleShareGPTPromptTokenizingStrategy(
+            ShareGPTPrompterV2(
+                conversation="chatml",
+                role_key_model=None,
+                role_key_human=None,
+            ),
+            tokenizer,
+            True,  # train_on_inputs
+            2048,  # sequence_len
+        )
+
+        dataset_wrapper = TokenizedPromptDataset(
+            strategy, sharegpt_dataset, process_count=1
+        )
+
+        labels = dataset_wrapper[0]["labels"]
+        # fmt: off
+        assert labels == [
+            1,  # bos
+            32001, 1587, 13, 25997, 32000, 28705, 13,  # system
+            32001, 2188, 13, 21558, 32000, 28705, 13,  # human
+            32001, 13892, 13, 21558, 32000, 28705, 13,  # gpt
+            32001, 2188, 13, 12684, 17664, 32000, 28705, 13,  # human
+            32001, 13892, 13, 12684, 17664, 32000, 28705, 13,  # gpt
+        ]
+        # fmt: on
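
Together the two tests pin the contract from both sides: with train_on_inputs=False only the assistant replies survive in the labels, while with train_on_inputs=True every token does. Assuming the tests live in tests/test_prompt_tokenizers.py, something like `pytest tests/test_prompt_tokenizers.py -k train_on_input` exercises both.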