Merge pull request #255 from OpenAccess-AI-Collective/open-orca-prompts

open orca support
This commit is contained in:
Wing Lian
2023-07-01 01:11:23 -04:00
committed by GitHub
2 changed files with 27 additions and 0 deletions

View File

@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
```json ```json
{"message_1": "...", "message_2": "..."} {"message_1": "...", "message_2": "..."}
``` ```
- `alpaca_w_system.load_open_orca`: support for OpenOrca datasets that include a system prompt, using the instruct prompt style
```json
{"system_prompt": "...", "question": "...", "response": "..."}
```
- `context_qa`: in context question answering from an article - `context_qa`: in context question answering from an article
```json ```json
{"article": "...", "question": "...", "answer": "..."} {"article": "...", "question": "...", "answer": "..."}

View File

@@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter):
yield res yield res
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Prompt tokenizing strategy for OpenOrca-style records, i.e. rows keyed by
    ``system_prompt``, ``question`` and ``response``.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        """
        Extract the prompt fields from a single dataset row.

        Returns a 4-tuple of: the row's ``question``, an empty string, the
        row's ``response``, and the row's ``system_prompt``. A missing key
        raises ``KeyError``.
        """
        question = prompt["question"]
        answer = prompt["response"]
        system = prompt["system_prompt"]
        # The second slot is always the empty string; presumably it is the
        # separate "input" field expected by the parent strategy, which
        # these rows do not carry -- confirm against the base class.
        return question, "", answer, system
def load(tokenizer, cfg):
    """
    Forward ``tokenizer`` and ``cfg`` to ``load_chat`` and return its result.
    """
    strategy = load_chat(tokenizer, cfg)
    return strategy
@@ -95,3 +109,12 @@ def load_chat(tokenizer, cfg):
cfg.train_on_inputs, cfg.train_on_inputs,
cfg.sequence_len, cfg.sequence_len,
) )
def load_open_orca(tokenizer, cfg):
    """
    Build an ``OpenOrcaPromptTokenizingStrategy`` for the given tokenizer.

    The strategy is paired with a ``SystemDataPrompter`` created with the
    instruct prompt style, and receives ``cfg.train_on_inputs`` and
    ``cfg.sequence_len`` from the config.
    """
    prompter = SystemDataPrompter(PromptStyle.INSTRUCT.value)
    strategy = OpenOrcaPromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    return strategy