From 2598c9f0453d02e57f7701c11b7af4914196a950 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 5 Mar 2024 11:43:33 -0500
Subject: [PATCH] allow the sharegpt handler to also better handle datasets destined for openai finetuning (#1361)

* allow the sharegpt handler to also better handle datasets destined for openai finetuning

* make sure to support system role
---
 src/axolotl/prompt_strategies/sharegpt.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/axolotl/prompt_strategies/sharegpt.py b/src/axolotl/prompt_strategies/sharegpt.py
index 15bfee8c4..6ac7cbafe 100644
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -82,7 +82,7 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
     basic sharegpt strategy to grab conversations from the sample row
     """
 
-    _strict = True
+    _strict = False
 
     @property
     def strict(self):
@@ -96,10 +96,25 @@ class SimpleShareGPTPromptTokenizingStrategy(ShareGPTPromptTokenizingStrategy):
         conversations = prompt["conversations"]
         if self.strict:
             return conversations
-        # remap roles - allow for assistant turn
-        role_map = {"human": "human", "assistant": "gpt", "gpt": "gpt"}
+        role_key = "from"
+        if "role" in conversations[0].keys():
+            role_key = "role"
+        value_key = "value"
+        if "text" in conversations[0].keys():
+            value_key = "text"
+        elif "content" in conversations[0].keys():
+            value_key = "content"
+        # remap roles - allow for assistant turn"
+        role_map = {
+            "user": "human",
+            "human": "human",
+            "assistant": "gpt",
+            "gpt": "gpt",
+            "system": "system",
+        }
         turns = [
-            {"from": role_map[t["from"]], "value": t["value"]} for t in conversations
+            {"from": role_map[t[role_key]], "value": t[value_key]}
+            for t in conversations
         ]
         return turns
 