diff --git a/README.md b/README.md index 27aec72db..4929987cb 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended): ```json {"message_1": "...", "message_2": "..."} ``` +- `alpaca_w_system.load_open_orca`: support for OpenOrca datasets with included system prompts, instruct + ```json + {"system_prompt": "...", "question": "...", "response": "..."} + ``` - `context_qa`: in context question answering from an article ```json {"article": "...", "question": "...", "answer": "..."} diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index aacae8739..1b4f50219 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter): yield res +class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy): + """ + Tokenizing strategy for OpenOrca datasets + """ + + def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]: return ( prompt["question"], "", prompt["response"], prompt["system_prompt"], ) + + def load(tokenizer, cfg): return InstructionWSystemPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.CHAT.value), @@ -82,3 +96,12 @@ def load(tokenizer, cfg): cfg.train_on_inputs, cfg.sequence_len, ) + + +def load_open_orca(tokenizer, cfg): return OpenOrcaPromptTokenizingStrategy( SystemDataPrompter(PromptStyle.INSTRUCT.value), tokenizer, cfg.train_on_inputs, cfg.sequence_len, )