optionally be able to specify alpaca or chat style prompts

This commit is contained in:
Wing Lian
2023-05-20 18:16:22 -04:00
parent fa8bd14be4
commit 1d5ab84486
6 changed files with 223 additions and 53 deletions

View File

@@ -1,5 +1,7 @@
import abc
import copy
import functools
import logging
from transformers import PreTrainedTokenizer
@@ -33,6 +35,20 @@ class PromptTokenizingStrategy(abc.ABC):
def tokenize_prompt(self, prompt):
pass
@functools.cache
def _get_user_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
if type(id_or_ids, (int,)):
return id_or_ids
return False
@functools.cache
def _get_assistant_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
if type(id_or_ids, (int,)):
return id_or_ids
return False
class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
def parse_instruction_fields(self, prompt) -> (str, str, str):
@@ -63,7 +79,7 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
response,
)))
def _tokenize(self, prompt, add_eos_token=True):
def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
result = self.tokenizer(
prompt,
truncation=True,
@@ -79,6 +95,13 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
result["input_ids"].append(self.tokenizer.eos_token_id)
result["attention_mask"].append(1)
if (
result["input_ids"][0] == self.tokenizer.bos_token_id
and strip_bos_token
):
result["input_ids"] = result["input_ids"][1:]
result["attention_mask"] = result["attention_mask"][1:]
result["labels"] = result["input_ids"].copy()
return result
@@ -239,23 +262,34 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
"labels": [],
}
current_len = 0
user_token = self._get_user_token()
assistant_token = self._get_assistant_token()
try:
for i, part in enumerate(self.prompter.build_prompt(prompt["conversations"], self.tokenizer)):
if i == 0:
for i, part in enumerate(self.prompter.build_prompt(prompt["conversations"])):
if isinstance(part, tuple):
if part[0] == "USER:":
part = part[0] + part[1] if not user_token else part[1]
# this is still the user query, we should
res = self._tokenize(part.strip(), add_eos_token=False, strip_bos_token=True)
if user_token:
res = [user_token, *res]
# everything from this is masked out from the labels
labels = [ IGNORE_TOKEN_ID ] * len(res["input_ids"])
elif part[0] == "ASSISTANT:":
part = part[0] + part[1] if not assistant_token else part[1]
# this should be the assistent response, should end with an eos token
res = self._tokenize(part.strip(), add_eos_token=True, strip_bos_token=True)
if assistant_token:
res = [assistant_token, *res]
# not masked out from labels
labels = copy.deepcopy(res["input_ids"])
else:
logging.warning("unhandled role: " + part[0])
else:
# this is only ever the first part, should include the bos token and the user query
res = self._tokenize(part.strip(), add_eos_token=False, strip_bos_token=False)
# everything from this is masked out from the labels
labels = [ IGNORE_TOKEN_ID ] * len(res["input_ids"])
elif i % 2 == 0:
# this is still the user query, we should
res = self._tokenize(part.strip(), add_eos_token=False, strip_bos_token=True)
# everything from this is masked out from the labels
labels = [ IGNORE_TOKEN_ID ] * len(res["input_ids"])
else:
# this should be the assistent response, should end with an eos token
res = self._tokenize(part.strip(), add_eos_token=True, strip_bos_token=True)
# not masked out from labels
labels = copy.deepcopy(res["input_ids"])
input_ids = res["input_ids"]
input_len = len(input_ids)
result["input_ids"][current_len : current_len + input_len] = input_ids

View File

@@ -1,15 +1,34 @@
import copy
import dataclasses
import logging
from enum import auto, Enum
from typing import List, Tuple, Any, Union, Generator
IGNORE_TOKEN_ID = -100
class PromptStyle(Enum):
instruct = "instruct"
chat = "chat"
class AlpacaPrompter:
prompt_input = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
prompt_no_input = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n"
response_split = "### Response:"
system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
prompt_style = None
def __init__(self, prompt_style="instruct"):
self.prompt_style = prompt_style
self.match_prompt_style()
def match_prompt_style(self):
if self.prompt_style == PromptStyle.instruct.value:
self.prompt_input = self.system_prompt + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
self.prompt_no_input = self.system_no_input_prompt + "### Instruction:\n{instruction}\n\n### Response:\n"
self.response_split = "### Response:"
if self.prompt_style == PromptStyle.chat.value:
self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
self.response_split = "ASSISTANT:"
def build_prompt(
self,
@@ -36,7 +55,7 @@ class JeopardyPrompter(AlpacaPrompter):
class MultipleChoiceExplainPrompter(AlpacaPrompter):
prompt_input = "Choose the answer that best answers the question. Explain your reasoning.\n\n### Question:\n{instruction}\n\n### Choices:\n{input}\n\n### Response:\n"
system_prompt = "Choose the answer that best answers the question. Explain your reasoning."
class MultipleChoiceConcisePrompter(AlpacaPrompter):
@@ -64,11 +83,30 @@ class NomicGPT4AllPrompter(AlpacaPrompter):
class ReflectAlpacaPrompter:
prompt_input = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
prompt_no_input = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n### Instruction:\n{instruction}\n\n### Response:\n"
agent_label = "{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
system_no_input_prompt = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
prompt_input = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
prompt_no_input = "### Instruction:\n{instruction}\n\n### Response:\n"
agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
response_split = "### Response:"
def __init__(self, prompt_style="instruct"):
self.prompt_style = prompt_style
self.match_prompt_style()
def match_prompt_style(self):
if self.prompt_style == PromptStyle.instruct.value:
self.prompt_input = self.system_prompt + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
self.prompt_no_input = self.system_no_input_prompt + "### Instruction:\n{instruction}\n\n### Response:\n"
self.agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
self.response_split = "### Final Response:"
if self.prompt_style == PromptStyle.chat.value:
self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
self.agent_label = "\nTHOUGHT: {output}\nASSISTANT REFLECTION: {reflection}\nASSISTANT:"
self.response_split = "ASSISTANT:"
def build_prompt(
self,
instruction: str,
@@ -118,13 +156,13 @@ class Conversation:
def get_prompt(self) -> Generator[str, None, None]:
seps = [self.sep, self.sep2]
preamble = self.system + seps[0]
yield preamble
for i, (role, message) in enumerate(self.messages):
if message:
yield preamble + role + ": " + message + seps[i % 2]
yield (role + ":", " " + message)
else:
yield role + ":"
if i == 0:
preamble = ""
logging.warning("role with empty message: " + role)
yield (role + ":", )
def copy(self):
return Conversation(
@@ -154,7 +192,17 @@ conv_vicuna_v1_1 = Conversation(
class ShareGPTPrompter:
def build_prompt(self, source, tokenizer, sequence_len=2048) -> Generator[str, None, None]:
def __init__(self, prompt_style=None):
if prompt_style != PromptStyle.chat.value:
raise Exception(f"unsupported prompt_style for ShareGPTPrompter({prompt_style})")
# def match_prompt_style(self):
# if self.prompt_style == PromptStyle.chat.value:
# self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
# self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
# self.response_split = "ASSISTANT:"
def build_prompt(self, source, *args, **kwargs) -> Generator[str, None, None]:
# ignore the system prompt if provided
if source[0]["from"] == "system":
source.pop(0)

View File

@@ -50,8 +50,16 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
if cfg.dataset_prepared_path
else Path(default_dataset_prepared_path) / ds_hash
)
dataset = None
try:
if cfg.push_dataset_to_hub:
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
except:
pass
if any(prepared_ds_path.glob("*")):
if dataset:
...
elif any(prepared_ds_path.glob("*")):
logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared dataset loaded from disk...")
@@ -85,68 +93,71 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
ds = load_dataset("json", data_files=fp, streaming=True, split=None)
if not ds:
raise Exception("unhandled dataset load")
if d.type == "alpaca":
d_type = d.type
d_type_split = d.type.split(":")
d_base_type = d_type_split[0]
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
if d_base_type == "alpaca":
ds_strategy = AlpacaPromptTokenizingStrategy(
AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
AlpacaPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "explainchoice":
elif d_base_type == "explainchoice":
ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
MultipleChoiceExplainPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
MultipleChoiceExplainPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "concisechoice":
elif d_base_type == "concisechoice":
ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
MultipleChoiceConcisePrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
MultipleChoiceConcisePrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "summarizetldr":
elif d_base_type == "summarizetldr":
ds_strategy = SummarizeTLDRPromptTokenizingStrategy(
SummarizeTLDRPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
SummarizeTLDRPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "jeopardy":
elif d_base_type == "jeopardy":
ds_strategy = JeopardyPromptTokenizingStrategy(
JeopardyPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
JeopardyPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "oasst":
elif d_base_type == "oasst":
ds_strategy = OpenAssistantPromptTokenizingStrategy(
AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
AlpacaPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "gpteacher":
elif d_base_type == "gpteacher":
ds_strategy = GPTeacherPromptTokenizingStrategy(
GPTeacherPrompter(),
GPTeacherPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "reflection":
elif d_base_type == "reflection":
ds_strategy = AlpacaReflectionPTStrategy(
ReflectAlpacaPrompter(),
ReflectAlpacaPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "sharegpt":
elif d_base_type == "sharegpt":
ds_strategy = ShareGPTPromptTokenizingStrategy(
ShareGPTPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
ShareGPTPrompter(d_prompt_style), tokenizer, cfg.train_on_inputs, cfg.sequence_len
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d.type == "completion":
elif d_base_type == "completion":
ds_strategy = CompletionPromptTokenizingStrategy(
CompletionPrompter(),
tokenizer,
@@ -168,6 +179,11 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
f"Saving merged prepared dataset to disk... {prepared_ds_path}"
)
dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
logging.info(
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True)
return dataset
@@ -182,13 +198,14 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
if cfg.max_packed_sequence_len is not None:
# see if we can go ahead and load the stacked dataset
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
ds_hash = str(
md5(
(
str(cfg.sequence_len)
+ "@"
+ str(max_packed_sequence_len)
+ seed
+ "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets]))
).encode("utf-8")
).hexdigest()
@@ -199,7 +216,19 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
else Path(default_dataset_prepared_path) / ds_hash
)
if any(prepared_ds_path.glob("*")):
dataset = None
try:
if cfg.push_dataset_to_hub:
logging.info(
f"checkking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
except:
pass
if dataset:
...
elif any(prepared_ds_path.glob("*")):
logging.info(
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
)
@@ -210,6 +239,9 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
tokenizer, cfg, default_dataset_prepared_path
)
if cfg.seed:
dataset = dataset.shuffle(seed=cfg.seed)
constant_len_dataset = ConstantLengthDataset(
tokenizer,
[dataset],
@@ -237,6 +269,11 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
)
dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
logging.info(
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True)
else:
dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path

View File

@@ -126,6 +126,32 @@ def load_model(
torch_dtype=torch_dtype,
device_map=cfg.device_map,
)
# elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
# This is a WIP, still an issue with the backward pass
# RuntimeError: grad can be implicitly created only for scalar outputs
# TODO: try config.sequence_parallel = False
# # https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/tests/models/test_gpt_neox.py#L12
# # https://github.com/HazyResearch/flash-attention/tree/main/training#model-components
# # add `**kwargs` to https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/flash_attn/models/gpt.py#L442
# from flash_attn.utils.pretrained import state_dict_from_pretrained
# from flash_attn.models.gpt import GPTLMHeadModel
# from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config
# from transformers import GPTNeoXConfig
# config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(base_model))
# config.use_flash_attn = True
# config.fused_bias_fc = True
# config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast"
# config.activation_function = "gelu_fast"
# config.fused_dropout_add_ln = True
# # config.residual_in_fp32 = True
#
# model: GPTLMHeadModel = GPTLMHeadModel.from_pretrained(
# base_model,
# config,
# dtype=torch_dtype,
# device=cfg.device,
# )
# model.train() # sets to train instead of eval mode
elif model_type:
model = getattr(transformers, model_type).from_pretrained(
base_model,
@@ -266,7 +292,7 @@ def load_llama_adapter(model, cfg):
task_type="CAUSAL_LM",
)
if cfg.peft_model_dir:
if cfg.lora_model_dir:
model = PeftModel.from_pretrained(
model,
cfg.lora_model_dir,
@@ -307,7 +333,7 @@ def load_lora(model, cfg):
model,
cfg.lora_model_dir,
device_map=cfg.device_map,
torch_dtype=torch.float16,
# torch_dtype=torch.float16,
)
else:
model = get_peft_model(model, lora_config)