From 78a1e1fa12b7b4698328a21e15abbc0958e8babf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 1 Jul 2023 00:19:41 -0400 Subject: [PATCH] open orca support --- README.md | 4 ++++ .../prompt_strategies/alpaca_w_system.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/README.md b/README.md index 27aec72db..4929987cb 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"message_1": "...", "message_2": "..."} ``` +- `alpaca_w_system.load_open_orca`: support for OpenOrca datasets that include system prompts, using the instruct prompt style + ```json + {"system_prompt": "...", "question": "...", "response": "..."} + ``` - `context_qa`: in context question answering from an article ```json {"article": "...", "question": "...", "answer": "..."} diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..1b4f50219 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter): yield res +class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): + """ + Tokenizing strategy for OpenOrca datasets + """ + + def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: + return ( + prompt["question"], + "", + prompt["response"], + prompt["system_prompt"], + ) + + def load(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), @@ -82,3 +96,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_open_orca(tokenizer, cfg): + return OpenOrcaPromptTokenizingStrategy( + SystemDataPrompter(PromptStyle.INSTRUCT.value), + tokenizer, + cfg.train_on_inputs, + cfg.sequence_len, + )