Add Glaive conversation format support (#1365)

* Add Glaive conversation format support

* fix black formatting errors

* Fix black and pylint formatting errors

* only set role_key_tool if provided in the dataset constructor

* Update src/axolotl/prompt_strategies/sharegpt.py

Co-authored-by: Wing Lian <wing.lian@gmail.com>

* sharegpt test

* tokenizer test

* fix formatting

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
Authored by Brian Fitzgerald on 2024-03-10 19:50:25 -05:00, committed by GitHub
parent b0ee9ec734
commit b7d8a7dc4d
6 changed files with 184 additions and 3 deletions

src/axolotl/prompt_strategies/sharegpt.py

@@ -1,10 +1,15 @@
"""Module containing the SimpleShareGPTPromptTokenizingStrategy class"""
from typing import Any, Dict, Optional
from fastchat.conversation import Conversation, SeparatorStyle, register_conv_template
from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompterV2
from axolotl.utils.tokenization import (
chatml_to_conversation,
merge_consecutive_messages,
)
def register_chatml_template(system_message=None):
@@ -19,6 +24,16 @@ def register_chatml_template(system_message=None):
sep="<|im_end|>",
)
)
register_conv_template(
Conversation(
name="chatml_glaive",
system_template="<|im_start|>system\n{system_message}",
system_message=system_message,
roles=["<|im_start|>user", "<|im_start|>assistant", "<|im_start|>tool"],
sep_style=SeparatorStyle.CHATML,
sep="<|im_end|>",
)
)
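# Illustrative rendering only (message content invented here, and this assumes
# fastchat's CHATML separator behaviour): a "chatml_glaive" conversation is
# expected to serialize roughly as
#
#   <|im_start|>system
#   You are a helpful assistant with access to functions.<|im_end|>
#   <|im_start|>user
#   What's the weather in Paris?<|im_end|>
#   <|im_start|>assistant
#   <functioncall> {"name": "get_weather"}<|im_end|>
#   <|im_start|>tool
#   {"temp_c": 21}<|im_end|>
#   <|im_start|>assistant
#   It's 21 degrees in Paris.<|im_end|>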
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
@@ -77,6 +92,20 @@ def load_guanaco(tokenizer, cfg):
)
def load_glaive(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
conversation = (
ds_cfg["conversation"]
if ds_cfg and "conversation" in ds_cfg
else "chatml_glaive"
)
return GlaiveShareGPTPromptTokenizingStrategy(
ShareGPTPrompterV2(conversation=conversation),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
"""
basic sharegpt strategy to grab conversations from the sample row
@@ -158,3 +187,15 @@ class UltrachatShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingSt
{"from": role_map[t["role"]], "value": t["content"]} for t in conversations
]
return turns
class GlaiveShareGPTPromptTokenizingStrategy(SimpleShareGPTPromptTokenizingStrategy):
"""
sharegpt strategy that remaps glaive data to sharegpt format
"""
def get_conversation_thread(self, prompt):
conversation = chatml_to_conversation(prompt)
conversation = merge_consecutive_messages(conversation)
return conversation
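A minimal sketch of how the new loader fits together, shown here for orientation only: the tokenizer name, config fields, and the sample row are invented stand-ins, and the call pattern simply mirrors the constructor arguments used in the loaders above.

from types import SimpleNamespace

from transformers import AutoTokenizer

from axolotl.prompt_strategies.sharegpt import load_glaive, register_chatml_template

# The chatml/chatml_glaive templates must be registered before the prompter
# looks them up by name.
register_chatml_template()

# Stand-ins; a real run uses the trainer's tokenizer and config.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
cfg = SimpleNamespace(train_on_inputs=False, sequence_len=2048)

strategy = load_glaive(tokenizer, cfg)  # GlaiveShareGPTPromptTokenizingStrategy

row = {
    "system": "SYSTEM: You are a helpful assistant with access to functions.",
    "chat": (
        "USER: What's the weather in Paris? "
        'ASSISTANT: <functioncall> {"name": "get_weather"} <|endoftext|> '
        'FUNCTION RESPONSE: {"temp_c": 21} '
        "ASSISTANT: It's 21 degrees in Paris. <|endoftext|>"
    ),
}

# Yields ShareGPT-style turns: system, human, gpt (function call), tool, gpt.
print(strategy.get_conversation_thread(row))

In a dataset config this loader would typically be selected with type: sharegpt.load_glaive (the usual module.function lookup), with the conversation name overridable through the dataset entry, as in the ds_cfg handling above.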

src/axolotl/prompt_tokenizers.py

@@ -360,11 +360,19 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
LOG.warning(f"expected tuple, got {part}")
continue
user, assistant = conversation.roles
tool_role_label = None
if len(conversation.roles) == 3:
(
user_role_label,
assistant_role_label,
tool_role_label,
) = conversation.roles
else:
user_role_label, assistant_role_label = conversation.roles
role, content = part
# Uses "in" because role contains extra characters
if user in role:
if user_role_label in role:
role = (
role.replace(role_remap[0]["from"], role_remap[0]["to"])
if role_remap
@@ -384,7 +392,7 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
else:
# everything from this is masked out from the labels
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
elif assistant in role:
elif assistant_role_label in role:
role = (
role.replace(role_remap[1]["from"], role_remap[1]["to"])
if role_remap
@@ -426,6 +434,8 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
else:
# everything from this is masked out from the labels
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
elif tool_role_label and tool_role_label in role:
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
else:
LOG.warning(f"unhandled role: {role}")
continue
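In effect, tool turns are treated like user turns with train_on_inputs off: they remain in the input ids but are excluded from the loss. A rough sketch of that masking step (token ids invented, IGNORE_TOKEN_ID being the usual -100 ignore index):

IGNORE_TOKEN_ID = -100  # assumed: standard ignore index used for label masking

res = {"input_ids": [27, 91, 321, 17, 2]}  # invented ids for one tokenized tool turn
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])  # whole turn masked from the loss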

src/axolotl/prompters.py

@@ -267,6 +267,8 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
role_key_human = "human"
role_key_model = "gpt"
# Optional, only used for tool usage datasets.
role_key_tool = None
def __init__(
self,
@@ -274,6 +276,7 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
conversation: Optional[Union[str, Conversation]] = None,
role_key_human: Optional[str] = None,
role_key_model: Optional[str] = None,
role_key_tool: Optional[str] = None,
):
if conversation:
if isinstance(conversation, Conversation):
@@ -286,6 +289,8 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
self.role_key_human = role_key_human
if role_key_model:
self.role_key_model = role_key_model
if role_key_tool:
self.role_key_tool = role_key_tool
def _build_result(self, source):
if len(source) < 2:
@@ -303,6 +308,8 @@ class ShareGPTPrompter(Prompter): # pylint: disable=too-few-public-methods
source.pop(0)
roles = {self.role_key_human: conv.roles[0], self.role_key_model: conv.roles[1]}
if self.role_key_tool:
roles[self.role_key_tool] = conv.roles[2]
try:
# Apply prompt templates
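For a quick sense of what the tool mapping produces, a hedged sketch; it assumes register_chatml_template() has already run so the chatml_glaive template can be looked up by name, and the keyword-only call is illustrative rather than the only way to construct the prompter.

from axolotl.prompters import ShareGPTPrompter

prompter = ShareGPTPrompter(
    conversation="chatml_glaive",
    role_key_tool="tool",
)
# Inside _build_result the role map then becomes roughly:
# {"human": "<|im_start|>user", "gpt": "<|im_start|>assistant", "tool": "<|im_start|>tool"}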

src/axolotl/utils/tokenization.py

@@ -2,6 +2,8 @@
import logging
import re
from typing import Dict, List
from termcolor import colored
@@ -36,3 +38,65 @@ def check_example_labels(example, tokenizer, text_only=False):
LOG.info("\n\n\n")
return " ".join(colored_tokens)
GLAIVE_ROLES = ["USER", "ASSISTANT", "FUNCTION RESPONSE"]
GLAIVE_TO_SHAREGPT_ROLE = {
"SYSTEM": "system",
"USER": "human",
"ASSISTANT": "gpt",
"FUNCTION RESPONSE": "tool",
}
GLAIVE_MSG_REGEX = re.compile(rf"({'|'.join(GLAIVE_ROLES)}): ")
def chatml_to_conversation(row: Dict[str, str]) -> List[Dict[str, str]]:
"""
Converts a ChatML formatted row to a list of messages in ShareGPT format.
Initially based off https://github.com/lilacai/lilac/blob/main/notebooks/GlaiveToShareGPT.ipynb.
"""
system_prompt = row.get("system")
if system_prompt:
system_prompt = system_prompt.removeprefix("SYSTEM: ")
chat_str = row["chat"]
chat_msgs = [s.strip() for s in GLAIVE_MSG_REGEX.split(chat_str) if s]
chat_msg_dicts = [
{"from": GLAIVE_TO_SHAREGPT_ROLE[role], "value": value}
for role, value in zip(chat_msgs[::2], chat_msgs[1::2])
]
if system_prompt:
chat_msg_dicts = [
{"from": GLAIVE_TO_SHAREGPT_ROLE["SYSTEM"], "value": system_prompt}
] + chat_msg_dicts
return chat_msg_dicts
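# Example (illustrative row), following the split/zip logic above:
#   chatml_to_conversation({"system": "SYSTEM: Be terse.",
#                           "chat": "USER: Hi ASSISTANT: Hello <|endoftext|>"})
# -> [{"from": "system", "value": "Be terse."},
#     {"from": "human", "value": "Hi"},
#     {"from": "gpt", "value": "Hello <|endoftext|>"}]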
def merge_consecutive_messages(messages):
"""
Merge consecutive messages from the same sender into a single message.
This can be useful with datasets that contain multiple consecutive tool calls.
"""
merged_messages = []
current_from = None
current_message = ""
for msg in messages:
if current_from == msg["from"]:
current_message += msg["value"]
else:
if current_from is not None:
merged_messages.append({"from": current_from, "value": current_message})
current_from = msg["from"]
current_message = msg["value"]
if current_from is not None:
merged_messages.append({"from": current_from, "value": current_message})
return merged_messages
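# Example (illustrative): consecutive messages from the same sender are
# concatenated in order, with no separator inserted between them.
#   merge_consecutive_messages([
#       {"from": "tool", "value": '{"temp_c": 21}'},
#       {"from": "tool", "value": '{"humidity": 40}'},
#       {"from": "gpt", "value": "It's 21 degrees with 40% humidity."},
#   ])
# -> [{"from": "tool", "value": '{"temp_c": 21}{"humidity": 40}'},
#     {"from": "gpt", "value": "It's 21 degrees with 40% humidity."}]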