misc sharegpt fixes (#723)

* support for sharegpt with assistant talking first, better masking of assistant token, allow remap of roles from dataset
* invalid role is actually not possible
* update tokenized fixture for corrected labels
2023-10-13 11:04:39 -04:00
parent bfbdba8614
commit f30afe4544
4 changed files with 107 additions and 36 deletions
--- a/tests/fixtures/conversation.tokenized.json
+++ b/tests/fixtures/conversation.tokenized.json
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -90,6 +90,73 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
            strat.tokenize_prompt(conversation)
            assert "assistant turn has empty text" in self._caplog.records[1].message

+    def test_sharegpt_warnings_turns(self):  # a repeated "human" turn should log a warning
+        conversation = {
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},  # assistant speaks first, right after system
+                {"from": "human", "value": "dolor"},
+                {"from": "human", "value": "dolor"},  # second "human" in a row breaks alternation
+                {"from": "gpt", "value": "sit"},
+            ]
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            strat.tokenize_prompt(conversation)
+            assert (  # the first captured log record must carry the alternation warning
+                "Role did not alternate between turns (gpt and human)"
+                in self._caplog.records[0].message
+            )
+
+    def test_sharegpt_changes_roles(self):  # a per-sample "roles" entry should reach the prompt
+        conversation = {
+            "roles": ["USER", "CHARACTER"],  # NOTE(review): presumably [human, gpt] names; confirm
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},
+                {"from": "human", "value": "dolor"},
+                {"from": "gpt", "value": "sit"},
+            ],
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            res = strat.tokenize_prompt(conversation)
+            assert "CHARACTER" in self.tokenizer.decode(res["input_ids"])  # custom name is in the text
+
+    def test_sharegpt_assistant_label_ignore(self):  # label at the assistant token must be -100
+        conversation = {
+            "roles": ["user", "assistant"],  # role names supplied by the sample itself
+            "conversations": [
+                {"from": "system", "value": "lorem"},
+                {"from": "gpt", "value": "ipsum"},
+                {"from": "human", "value": "dolor"},
+                {"from": "gpt", "value": "sit"},
+            ],
+        }
+        prompter = ShareGPTPrompterV2()
+        strat = ShareGPTPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        with self._caplog.at_level(logging.WARNING):
+            res = strat.tokenize_prompt(conversation)
+            idx = res["input_ids"].index(20255)  # assistant token (first occurrence of this id)
+            assert res["labels"][idx] == -100  # presumably the ignored-label value; confirm
+
    def test_no_sys_prompt(self):
        """
        tests the interface between the user and assistant parts