add support for defined train split (#654)

This commit is contained in:
Wing Lian
2023-09-28 20:14:14 -04:00
committed by GitHub
parent 8662e8ffe8
commit 409ca0f21c
3 changed files with 61 additions and 0 deletions

View File

@@ -24,6 +24,15 @@ def load(tokenizer, cfg):
)
def load_v2(tokenizer, cfg):
return ContextQaV2PromptTokenizingStrategy(
ContextV2Prompter(),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
class AlpacaContextPrompter(AlpacaPrompter):
"""
Customized system prompted for concise QA
@@ -50,6 +59,38 @@ class AlpacaContextPromptTokenizingStrategy(InstructionPromptTokenizingStrategy)
)
class ContextQaV2PromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
"""
Tokenization Strategy to combine in-context article with a question and answer
"""
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
return (
"Context: "
+ prompt["context"]
+ "\nQuestion: "
+ prompt["question"]
+ "\n",
"",
"Answer: " + prompt["answer"],
)
class ContextV2Prompter(AlpacaPrompter):
"""
Customized system prompted for concise QA
"""
system_prompt = ""
system_no_input_prompt = ""
def match_prompt_style(self):
# pylint: disable=duplicate-code
self.turn_format = "{instruction}\n{input}"
self.turn_no_input_format = "{instruction}"
self.system_format = "{system}"
class AlpacaMissingInfoContextPromptTokenizingStrategy(
InstructionPromptTokenizingStrategy
):

View File

@@ -247,6 +247,16 @@ def load_tokenized_prepared_datasets(
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
if "train" in ds:
ds = ds["train"]
elif (
isinstance(ds, DatasetDict)
and d.train_on_split
and d.train_on_split in ds
):
ds = ds[d.train_on_split]
elif isinstance(ds, DatasetDict):
raise ValueError(
f"no train split found for dataset {d.path}, you may specify a split with 'train_on_split: `"
)
if (
"input_ids" in ds.features
and "attention_mask" in ds.features