Merge pull request #255 from OpenAccess-AI-Collective/open-orca-prompts
open orca support
This commit is contained in:
@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|||||||
```json
|
```json
|
||||||
{"message_1": "...", "message_2": "..."}
|
{"message_1": "...", "message_2": "..."}
|
||||||
```
|
```
|
||||||
|
- `alpaca_w_system.load_open_orca`: support for Open Orca datasets that include system prompts, using the instruct prompt style
|
||||||
|
```json
|
||||||
|
{"system_prompt": "...", "question": "...", "response": "..."}
|
||||||
|
```
|
||||||
- `context_qa`: in context question answering from an article
|
- `context_qa`: in context question answering from an article
|
||||||
```json
|
```json
|
||||||
{"article": "...", "question": "...", "answer": "..."}
|
{"article": "...", "question": "...", "answer": "..."}
|
||||||
|
|||||||
@@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter):
|
|||||||
yield res
|
yield res
|
||||||
|
|
||||||
|
|
||||||
|
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca datasets.

    Rows are read through the keys "question", "response" and
    "system_prompt"; a missing key raises KeyError.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        """
        Pull the OpenOrca fields out of a single dataset row.

        Returns a 4-tuple of (question, "", response, system_prompt).
        The second slot is always the empty string.
        NOTE(review): presumably the slots map to (instruction, input,
        response, system) as consumed by the parent strategy - confirm
        against InstructionWSystemPromptTokenizingStrategy.
        """
        question = prompt["question"]
        answer = prompt["response"]
        system = prompt["system_prompt"]
        return question, "", answer, system
|
||||||
|
|
||||||
|
|
||||||
def load(tokenizer, cfg):
    """Build a tokenizing strategy by delegating to ``load_chat``.

    Both arguments are forwarded unchanged and the result of
    ``load_chat`` is returned as-is.
    """
    strategy = load_chat(tokenizer, cfg)
    return strategy
|
||||||
|
|
||||||
@@ -95,3 +109,12 @@ def load_chat(tokenizer, cfg):
|
|||||||
cfg.train_on_inputs,
|
cfg.train_on_inputs,
|
||||||
cfg.sequence_len,
|
cfg.sequence_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_open_orca(tokenizer, cfg):
    """Build the tokenizing strategy used for OpenOrca datasets.

    A ``SystemDataPrompter`` in the instruct prompt style is paired with
    the given tokenizer; ``cfg.train_on_inputs`` and ``cfg.sequence_len``
    are read from the config and passed through to the strategy.
    """
    instruct_prompter = SystemDataPrompter(PromptStyle.INSTRUCT.value)
    return OpenOrcaPromptTokenizingStrategy(
        instruct_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
|
||||||
|
|||||||
Reference in New Issue
Block a user