Merge pull request #155 from OpenAccess-AI-Collective/misc-fixes
new prompters, misc fixes for output dir missing using fsdp, and changing max seq len
This commit is contained in:
20
README.md
20
README.md
@@ -165,10 +165,30 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
|||||||
```json
|
```json
|
||||||
{"article": "...", "summary": "..."}
|
{"article": "...", "summary": "..."}
|
||||||
```
|
```
|
||||||
|
- `alpaca_chat`: basic instruct for alpaca chat
|
||||||
|
```json
|
||||||
|
{"instruction": "...", "input": "...", "response": "..."}
|
||||||
|
```
|
||||||
- `alpaca_chat.load_qa`: question and answer for alpaca chat
|
- `alpaca_chat.load_qa`: question and answer for alpaca chat
|
||||||
```json
|
```json
|
||||||
{"question": "...", "answer": "..."}
|
{"question": "...", "answer": "..."}
|
||||||
```
|
```
|
||||||
|
- `alpaca_chat.load_concise`: basic instruct for alpaca chat, with a system prompt that asks for concise answers
|
||||||
|
```json
|
||||||
|
{"instruction": "...", "input": "...", "response": "..."}
|
||||||
|
```
|
||||||
|
- `alpaca_chat.load_camel_ai`: question and answer for alpaca chat, for CamelAI datasets (`message_1` is the question, `message_2` is the answer)
|
||||||
|
```json
|
||||||
|
{"message_1": "...", "message_2": "..."}
|
||||||
|
```
|
||||||
|
- `context_qa`: in context question answering from an article
|
||||||
|
```json
|
||||||
|
{"article": "...", "question": "...", "answer": "..."}
|
||||||
|
```
|
||||||
|
- `context_qa.load_404`: in context question answering from an article, with default response for no answer from context
|
||||||
|
```json
|
||||||
|
{"article": "...", "unanswerable_question": "..."}
|
||||||
|
```
|
||||||
- `creative_acr.load_answer`: instruction and revision
|
- `creative_acr.load_answer`: instruction and revision
|
||||||
```json
|
```json
|
||||||
{"instruction": "...", "revision": "..."}
|
{"instruction": "...", "revision": "..."}
|
||||||
|
|||||||
@@ -279,6 +279,9 @@ def train(
|
|||||||
logging.info(
|
logging.info(
|
||||||
f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
|
f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if not Path(cfg.output_dir).is_dir():
|
||||||
|
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||||
|
|
||||||
logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
||||||
|
|||||||
@@ -18,6 +18,15 @@ def load(tokenizer, cfg):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AlpacaConcisePrompter(AlpacaPrompter):
    """
    Alpaca prompter variant whose system prompts additionally ask for a concise response
    """

    # Prompt text worded for an instruction that is paired with an input.
    system_prompt = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that concisely and appropriately completes the request.\n\n"
    )
    # Prompt text worded for an instruction on its own (no input mentioned).
    system_no_input_prompt = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately and concisely completes the request.\n\n"
    )
|
||||||
|
|
||||||
|
|
||||||
class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
||||||
"""
|
"""
|
||||||
Tokenizing strategy for AlpacaQA
|
Tokenizing strategy for AlpacaQA
|
||||||
@@ -31,6 +40,28 @@ class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class CamelAIPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for CamelAI datasets
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """
        Split a CamelAI record into the three fields of an instruction prompt.

        The record is read by key: `message_1` becomes the instruction, the
        input slot is left empty, and `message_2` becomes the response.

        Raises KeyError if either `message_1` or `message_2` is missing.
        """
        return (
            prompt["message_1"],
            "",
            # The response must come from `message_2`; reusing `message_1`
            # here would train the model to repeat the question back.
            prompt["message_2"],
        )
|
||||||
|
|
||||||
|
|
||||||
|
def load_concise(tokenizer, cfg):
    """
    Build an AlpacaPromptTokenizingStrategy driven by a chat-style AlpacaConcisePrompter.

    `cfg.train_on_inputs` and `cfg.sequence_len` are forwarded to the strategy unchanged.
    """
    concise_prompter = AlpacaConcisePrompter(PromptStyle.CHAT.value)
    return AlpacaPromptTokenizingStrategy(
        concise_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
|
||||||
|
|
||||||
|
|
||||||
def load_qa(tokenizer, cfg):
|
def load_qa(tokenizer, cfg):
|
||||||
return AlpacaQAPromptTokenizingStrategy(
|
return AlpacaQAPromptTokenizingStrategy(
|
||||||
AlpacaPrompter(PromptStyle.CHAT.value),
|
AlpacaPrompter(PromptStyle.CHAT.value),
|
||||||
@@ -38,3 +69,12 @@ def load_qa(tokenizer, cfg):
|
|||||||
cfg.train_on_inputs,
|
cfg.train_on_inputs,
|
||||||
cfg.sequence_len,
|
cfg.sequence_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load_camel_ai(tokenizer, cfg):
    """
    Build a CamelAIPromptTokenizingStrategy driven by a chat-style AlpacaPrompter.

    `cfg.train_on_inputs` and `cfg.sequence_len` are forwarded to the strategy unchanged.
    """
    chat_prompter = AlpacaPrompter(PromptStyle.CHAT.value)
    return CamelAIPromptTokenizingStrategy(
        chat_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
|
||||||
|
|||||||
67
src/axolotl/prompt_strategies/context_qa.py
Normal file
67
src/axolotl/prompt_strategies/context_qa.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
"""Module containing the classes for Context QA Prompt Tokenization Strategies"""
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
|
||||||
|
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||||
|
|
||||||
|
|
||||||
|
# article, unanswerable_question, question, answer
|
||||||
|
def load_404(tokenizer, cfg):
    """
    Build an AlpacaMissingInfoContextPromptTokenizingStrategy driven by a chat-style
    AlpacaContextPrompter.

    `cfg.train_on_inputs` and `cfg.sequence_len` are forwarded to the strategy unchanged.
    """
    context_prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
    return AlpacaMissingInfoContextPromptTokenizingStrategy(
        context_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load(tokenizer, cfg):
    """
    Build an AlpacaContextPromptTokenizingStrategy driven by a chat-style
    AlpacaContextPrompter.

    `cfg.train_on_inputs` and `cfg.sequence_len` are forwarded to the strategy unchanged.
    """
    context_prompter = AlpacaContextPrompter(PromptStyle.CHAT.value)
    return AlpacaContextPromptTokenizingStrategy(
        context_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
|
||||||
|
|
||||||
|
|
||||||
|
class AlpacaContextPrompter(AlpacaPrompter):
    """
    Alpaca prompter with a customized system prompt for concise, context-based QA
    """

    system_prompt = "Use the following contextual information to concisely answer the question.\n"
    # The no-input variant carries exactly the same text as the regular prompt.
    system_no_input_prompt = system_prompt
|
||||||
|
|
||||||
|
|
||||||
|
class AlpacaContextPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenization strategy that pairs an in-context article with a question and its answer
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """
        Return the article and question joined by a "===" separator line, an empty
        input, and the record's answer.
        """
        article_and_question = prompt["article"] + "\n===\n" + prompt["question"]
        reply = prompt["answer"]
        return article_and_question, "", reply
|
||||||
|
|
||||||
|
|
||||||
|
class AlpacaMissingInfoContextPromptTokenizingStrategy(
    InstructionPromptTokenizingStrategy
):
    """
    Tokenization strategy that pairs an in-context article with a question the article
    cannot answer, and supplies a fixed response saying so
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        """
        Return the article and unanswerable question joined by a "===" separator line,
        an empty input, and a constant "unable to answer" response.
        """
        article_and_question = (
            prompt["article"] + "\n===\n" + prompt["unanswerable_question"]
        )
        # The response is the same for every record; nothing is read from the prompt.
        fallback_reply = (
            "The context provided does not contain any information about your inquiry. "
            "Therefore, I'm unable to answer your question based on the given context."
        )
        return article_and_question, "", fallback_reply
|
||||||
@@ -234,6 +234,10 @@ def load_model(
|
|||||||
base_model,
|
base_model,
|
||||||
trust_remote_code=cfg.trust_remote_code or False,
|
trust_remote_code=cfg.trust_remote_code or False,
|
||||||
)
|
)
|
||||||
|
# Shouldn't be a problem most of the time. will obviously error if the model doesn't support this
|
||||||
|
# when training starts
|
||||||
|
if config.max_seq_len and cfg.sequence_len > config.max_seq_len:
|
||||||
|
config.max_seq_len = cfg.sequence_len
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
base_model,
|
base_model,
|
||||||
config=config,
|
config=config,
|
||||||
|
|||||||
Reference in New Issue
Block a user