Compare commits
1 Commits
compute-pe
...
flan-no-bo
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e91fed495a |
2
.github/workflows/base.yml
vendored
2
.github/workflows/base.yml
vendored
@@ -26,7 +26,7 @@ jobs:
|
||||
pytorch: 2.0.0
|
||||
axolotl_extras:
|
||||
- cuda: "117"
|
||||
cuda_version: 11.7.1
|
||||
cuda_version: 11.7.0
|
||||
python_version: "3.9"
|
||||
pytorch: 1.13.1
|
||||
axolotl_extras:
|
||||
|
||||
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -30,7 +30,7 @@ jobs:
|
||||
pytorch: 2.0.0
|
||||
axolotl_extras: gptq
|
||||
- cuda: cu117
|
||||
cuda_version: 11.7.1
|
||||
cuda_version: 11.7.0
|
||||
python_version: "3.9"
|
||||
pytorch: 1.13.1
|
||||
axolotl_extras:
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
pytorch: 2.0.0
|
||||
axolotl_extras: gptq
|
||||
- cuda: cu117
|
||||
cuda_version: 11.7.1
|
||||
cuda_version: 11.7.0
|
||||
python_version: "3.9"
|
||||
pytorch: 1.13.1
|
||||
axolotl_extras:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
default_language_version:
|
||||
python: python3
|
||||
python: python3.9
|
||||
|
||||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
|
||||
26
README.md
26
README.md
@@ -195,10 +195,6 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
||||
```json
|
||||
{"message_1": "...", "message_2": "..."}
|
||||
```
|
||||
- `alpaca_w_system.load_open_orca`: support for open orca datasets with included system prompts, instruct
|
||||
```json
|
||||
{"system_prompt": "...", "question": "...", "response": "..."}
|
||||
```
|
||||
- `context_qa`: in context question answering from an article
|
||||
```json
|
||||
{"article": "...", "question": "...", "answer": "..."}
|
||||
@@ -237,7 +233,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
||||
#### How to add custom prompts
|
||||
|
||||
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
||||
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
|
||||
2. Use your custom file name as the dataset type.
|
||||
|
||||
Optionally, download some datasets, see [data/README.md](data/README.md)
|
||||
|
||||
@@ -255,18 +251,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
||||
|
||||
- dataset
|
||||
```yaml
|
||||
sequence_len: 2048 # max token length for prompt
|
||||
|
||||
# huggingface repo
|
||||
datasets:
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
type: alpaca # format from earlier
|
||||
|
||||
# local
|
||||
datasets:
|
||||
- path: json
|
||||
data_files: data.jsonl # or json
|
||||
- path: vicgalle/alpaca-gpt4 # local or huggingface repo
|
||||
type: alpaca # format from earlier
|
||||
sequence_len: 2048 # max token length / prompt
|
||||
```
|
||||
|
||||
- loading
|
||||
@@ -314,8 +302,6 @@ model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
# Trust remote code for untrusted source
|
||||
trust_remote_code:
|
||||
# use_fast option for tokenizer loading from_pretrained, default to True
|
||||
tokenizer_use_fast:
|
||||
|
||||
# whether you are training a 4-bit GPTQ quantized model
|
||||
gptq: true
|
||||
@@ -336,10 +322,10 @@ tf32: true # require >=ampere
|
||||
|
||||
# a list of one or more datasets to finetune the model with
|
||||
datasets:
|
||||
# hf dataset repo | "json" for local dataset, make sure to fill data_files
|
||||
# this can be either a hf dataset, or relative path
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||
type: alpaca # format OR format:prompt_style (chat/instruct)
|
||||
data_files: # path to source data files
|
||||
shards: # number of shards to split data into
|
||||
|
||||
@@ -348,8 +334,6 @@ datasets:
|
||||
dataset_prepared_path: data/last_run_prepared
|
||||
# push prepared dataset to hub
|
||||
push_dataset_to_hub: # repo path
|
||||
# push checkpoints to hub
|
||||
hub_model_id: # repo path
|
||||
# whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
|
||||
# required to be true when used in combination with `push_dataset_to_hub`
|
||||
hf_use_auth_token: # boolean
|
||||
|
||||
@@ -77,7 +77,7 @@ FROM base-builder
|
||||
RUN python3 -m pip uninstall -y apex
|
||||
RUN git clone https://github.com/NVIDIA/apex
|
||||
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
|
||||
RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
|
||||
RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
|
||||
|
||||
RUN mkdir -p /workspace/builds
|
||||
COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
|
||||
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
|
||||
RUN git lfs install --skip-repo
|
||||
RUN pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
pip3 install -U --no-cache-dir pydantic
|
||||
|
||||
@@ -126,7 +126,6 @@ class ConstantLengthDataset(IterableDataset):
|
||||
buffer_len = 0
|
||||
|
||||
if example:
|
||||
# FIXME
|
||||
# just going to drop data points that are too long
|
||||
if len(example["input_ids"]) <= self.seq_length:
|
||||
input_ids = example["input_ids"]
|
||||
|
||||
@@ -6,7 +6,7 @@ from axolotl.prompt_tokenizers import (
|
||||
AlpacaPromptTokenizingStrategy,
|
||||
InstructionPromptTokenizingStrategy,
|
||||
)
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||
|
||||
|
||||
def load(tokenizer, cfg):
|
||||
@@ -45,10 +45,8 @@ class NoSystemPrompter(AlpacaPrompter):
|
||||
Null Prompter with no system prompts
|
||||
"""
|
||||
|
||||
system_prompt = ""
|
||||
system_no_input_prompt = ""
|
||||
turn_format = "{instruction} {input} "
|
||||
turn_no_input_format = "{instruction} "
|
||||
prompt_input = "{instruction} {input} "
|
||||
prompt_no_input = "{instruction} "
|
||||
|
||||
def __init__(self): # pylint: disable=super-init-not-called
|
||||
pass
|
||||
@@ -105,12 +103,3 @@ def load_camel_ai(tokenizer, cfg):
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
|
||||
def load_no_prompt(tokenizer, cfg):
|
||||
return AlpacaPromptTokenizingStrategy(
|
||||
UnpromptedPrompter(PromptStyle.CHAT.value),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
"""Module loading the AlpacaInstructPromptTokenizingStrategy class"""
|
||||
|
||||
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||
|
||||
|
||||
def load(tokenizer, cfg):
|
||||
@@ -11,12 +11,3 @@ def load(tokenizer, cfg):
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
|
||||
def load_no_prompt(tokenizer, cfg):
|
||||
return AlpacaPromptTokenizingStrategy(
|
||||
UnpromptedPrompter(PromptStyle.INSTRUCT.value),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
@@ -1,120 +0,0 @@
|
||||
"""
|
||||
Prompt strategies loader for alpaca instruction datasets with system prompts
|
||||
"""
|
||||
from typing import Generator, Tuple, Union
|
||||
|
||||
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||
|
||||
|
||||
class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
"""
|
||||
Tokenizing strategy for instruction-based prompts.
|
||||
"""
|
||||
|
||||
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
|
||||
return (
|
||||
prompt["instruction"],
|
||||
prompt["input"] if "input" in prompt else "",
|
||||
prompt["output"],
|
||||
prompt["system"],
|
||||
)
|
||||
|
||||
def tokenize_prompt(self, prompt):
|
||||
# pylint: disable=duplicate-code
|
||||
(
|
||||
instruction,
|
||||
input, # pylint: disable=redefined-builtin
|
||||
response,
|
||||
system,
|
||||
) = self.parse_instruction_fields(prompt)
|
||||
user_prompt = next(
|
||||
iter(
|
||||
self.prompter.build_prompt_w_system(
|
||||
system,
|
||||
instruction,
|
||||
input,
|
||||
)
|
||||
)
|
||||
)
|
||||
tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
|
||||
if not self.train_on_inputs:
|
||||
user_prompt_len = len(tokenized_prompt["input_ids"])
|
||||
# TODO this could be sped up using numpy array slicing
|
||||
tokenized_prompt["labels"] = [-100] * user_prompt_len
|
||||
tokenized_res_prompt = self._tokenize(
|
||||
response, strip_bos_token=True, add_eos_token=True
|
||||
)
|
||||
tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
|
||||
tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
|
||||
tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]
|
||||
|
||||
return tokenized_prompt
|
||||
|
||||
|
||||
class SystemDataPrompter(AlpacaPrompter):
|
||||
"""
|
||||
Alpaca Style Prompter that uses system prompts from the dataset
|
||||
"""
|
||||
|
||||
def build_prompt_w_system(
|
||||
self,
|
||||
system: str,
|
||||
instruction: str,
|
||||
input: Union[None, str] = None, # pylint: disable=redefined-builtin
|
||||
output: Union[None, str] = None,
|
||||
) -> Generator[str, None, None]:
|
||||
# returns the full prompt from instruction and optional input
|
||||
# if a label (=response, =output) is provided, it's also appended.
|
||||
if input:
|
||||
res = system + self.turn_format.format(instruction=instruction, input=input)
|
||||
else:
|
||||
res = system + self.turn_no_input_format.format(instruction=instruction)
|
||||
if output:
|
||||
res = f"{res}{output}"
|
||||
yield res
|
||||
|
||||
|
||||
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
|
||||
"""
|
||||
Tokenizing strategy for OpenOrca datasets
|
||||
"""
|
||||
|
||||
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
|
||||
return (
|
||||
prompt["question"],
|
||||
"",
|
||||
prompt["response"],
|
||||
prompt["system_prompt"],
|
||||
)
|
||||
|
||||
|
||||
def load(tokenizer, cfg):
|
||||
return load_chat(tokenizer, cfg)
|
||||
|
||||
|
||||
def load_instruct(tokenizer, cfg):
|
||||
return InstructionWSystemPromptTokenizingStrategy(
|
||||
SystemDataPrompter(PromptStyle.INSTRUCT.value),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
|
||||
def load_chat(tokenizer, cfg):
|
||||
return InstructionWSystemPromptTokenizingStrategy(
|
||||
SystemDataPrompter(PromptStyle.CHAT.value),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
|
||||
|
||||
def load_open_orca(tokenizer, cfg):
|
||||
return OpenOrcaPromptTokenizingStrategy(
|
||||
SystemDataPrompter(PromptStyle.INSTRUCT.value),
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
)
|
||||
@@ -73,8 +73,17 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
):
|
||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
||||
result["attention_mask"].append(1)
|
||||
elif ( # some tokenizers automatically add an eos token, let's remove it
|
||||
not add_eos_token and result["input_ids"][-1] == self.tokenizer.eos_token_id
|
||||
):
|
||||
result["input_ids"] = result["input_ids"][:-1]
|
||||
result["attention_mask"] = result["attention_mask"][:-1]
|
||||
|
||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
||||
if (
|
||||
self.tokenizer.bos_token_id
|
||||
and result["input_ids"][0] == self.tokenizer.bos_token_id
|
||||
and strip_bos_token
|
||||
):
|
||||
result["input_ids"] = result["input_ids"][1:]
|
||||
result["attention_mask"] = result["attention_mask"][1:]
|
||||
|
||||
@@ -87,9 +96,7 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
Tokenizing strategy for instruction-based prompts.
|
||||
"""
|
||||
|
||||
def parse_instruction_fields(
|
||||
self, prompt
|
||||
) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
|
||||
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
|
||||
raise NotImplementedError
|
||||
|
||||
def tokenize_prompt(self, prompt):
|
||||
@@ -414,7 +421,11 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
||||
result["attention_mask"].append(1)
|
||||
|
||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
||||
if (
|
||||
self.tokenizer.bos_token_id
|
||||
and result["input_ids"][0] == self.tokenizer.bos_token_id
|
||||
and strip_bos_token
|
||||
):
|
||||
result["input_ids"] = result["input_ids"][1:]
|
||||
result["attention_mask"] = result["attention_mask"][1:]
|
||||
|
||||
@@ -440,7 +451,7 @@ def parse_tokenized_to_result(
|
||||
result: Dict[str, List[int]],
|
||||
current_len: int,
|
||||
res: Dict[str, List[int]],
|
||||
labels: List[int],
|
||||
labels: list[int],
|
||||
pad_token_id: Union[int, None] = None,
|
||||
) -> Tuple[Dict[str, List[int]], int]:
|
||||
"""
|
||||
|
||||
@@ -24,8 +24,6 @@ class AlpacaPrompter:
|
||||
|
||||
system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
|
||||
system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
|
||||
turn_format: str
|
||||
turn_no_input_format: str
|
||||
prompt_style: Optional[PromptStyle] = None
|
||||
|
||||
def __init__(self, prompt_style=PromptStyle.INSTRUCT.value):
|
||||
@@ -34,13 +32,23 @@ class AlpacaPrompter:
|
||||
|
||||
def match_prompt_style(self):
|
||||
if self.prompt_style == PromptStyle.INSTRUCT.value:
|
||||
self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
|
||||
self.turn_no_input_format = (
|
||||
"### Instruction:\n{instruction}\n\n### Response:\n"
|
||||
self.prompt_input = (
|
||||
self.system_prompt
|
||||
+ "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
|
||||
)
|
||||
self.prompt_no_input = (
|
||||
self.system_no_input_prompt
|
||||
+ "### Instruction:\n{instruction}\n\n### Response:\n"
|
||||
)
|
||||
self.response_split = "### Response:"
|
||||
if self.prompt_style == PromptStyle.CHAT.value:
|
||||
self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
|
||||
self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
|
||||
self.prompt_input = (
|
||||
self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
|
||||
)
|
||||
self.prompt_no_input = (
|
||||
self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
|
||||
)
|
||||
self.response_split = "ASSISTANT:"
|
||||
|
||||
def build_prompt(
|
||||
self,
|
||||
@@ -51,17 +59,16 @@ class AlpacaPrompter:
|
||||
# returns the full prompt from instruction and optional input
|
||||
# if a label (=response, =output) is provided, it's also appended.
|
||||
if input:
|
||||
res = self.system_prompt + self.turn_format.format(
|
||||
instruction=instruction, input=input
|
||||
)
|
||||
res = self.prompt_input.format(instruction=instruction, input=input)
|
||||
else:
|
||||
res = self.system_no_input_prompt + self.turn_no_input_format.format(
|
||||
instruction=instruction
|
||||
)
|
||||
res = self.prompt_no_input.format(instruction=instruction)
|
||||
if output:
|
||||
res = f"{res}{output}"
|
||||
yield res
|
||||
|
||||
def get_response(self, output: str) -> str:
|
||||
return output.split(self.response_split)[1].strip()
|
||||
|
||||
|
||||
class UnpromptedPrompter(AlpacaPrompter):
|
||||
"""
|
||||
@@ -86,10 +93,7 @@ class MultipleChoiceExplainPrompter(AlpacaPrompter):
|
||||
"""
|
||||
|
||||
system_prompt = (
|
||||
"Choose the answer that best answers the question. Explain your reasoning.\n"
|
||||
)
|
||||
system_no_input_prompt = (
|
||||
"Choose the answer that best answers the question. Explain your reasoning.\n"
|
||||
"Choose the answer that best answers the question. Explain your reasoning."
|
||||
)
|
||||
|
||||
|
||||
@@ -98,12 +102,7 @@ class MultipleChoiceConcisePrompter(AlpacaPrompter):
|
||||
Prompter for multiple choice concise
|
||||
"""
|
||||
|
||||
system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
|
||||
system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
|
||||
|
||||
def match_prompt_style(self):
|
||||
self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
|
||||
self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
|
||||
prompt_input = "Choose the answer that best answers the question. Be concise in your response.\n\nUSER: {instruction}\n{input}\nASSISTANT:\n"
|
||||
|
||||
|
||||
class SummarizeTLDRPrompter(AlpacaPrompter):
|
||||
@@ -111,12 +110,9 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
|
||||
Prompter for summarize TLDR
|
||||
"""
|
||||
|
||||
system_prompt = ""
|
||||
system_no_input_prompt = ""
|
||||
|
||||
def match_prompt_style(self):
|
||||
self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
|
||||
self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
|
||||
prompt_no_input = (
|
||||
"USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
|
||||
)
|
||||
|
||||
|
||||
class CompletionPrompter:
|
||||
@@ -132,6 +128,9 @@ class CompletionPrompter:
|
||||
) -> Generator[str, None, None]:
|
||||
yield instruction
|
||||
|
||||
def get_response(self, output: str) -> str:
|
||||
return output.strip()
|
||||
|
||||
|
||||
class GPTeacherPrompter(AlpacaPrompter):
|
||||
"""
|
||||
@@ -211,6 +210,9 @@ class ReflectAlpacaPrompter:
|
||||
res = f"{res}{label}"
|
||||
yield res
|
||||
|
||||
def get_response(self, output: str) -> str:
|
||||
return output.split(self.response_split)[1].strip()
|
||||
|
||||
|
||||
class SeparatorStyle(Enum):
|
||||
"""Different separator style."""
|
||||
@@ -287,6 +289,12 @@ class ShareGPTPrompter: # pylint: disable=too-few-public-methods
|
||||
sep2=" ",
|
||||
)
|
||||
|
||||
# def match_prompt_style(self):
|
||||
# if self.prompt_style == PromptStyle.chat.value:
|
||||
# self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
|
||||
# self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
|
||||
# self.response_split = "ASSISTANT:"
|
||||
|
||||
def build_prompt(self, source) -> Generator[str, None, None]:
|
||||
# ignore the system prompt if provided
|
||||
if source[0]["from"] == "system":
|
||||
|
||||
@@ -102,26 +102,13 @@ def load_tokenized_prepared_datasets(
|
||||
pass
|
||||
|
||||
# prefer local dataset, even if hub exists
|
||||
local_path = Path(d.path)
|
||||
if local_path.exists():
|
||||
if local_path.is_dir():
|
||||
ds = load_dataset(
|
||||
d.path,
|
||||
data_files=d.data_files,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
elif local_path.is_file():
|
||||
ds = load_dataset(
|
||||
"json",
|
||||
data_files=d.path,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
||||
)
|
||||
if Path(d.path).exists():
|
||||
ds = load_dataset(
|
||||
"json",
|
||||
data_files=d.path,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
elif ds_from_hub:
|
||||
if d.data_files:
|
||||
ds = load_dataset(
|
||||
|
||||
@@ -34,20 +34,15 @@ def load_tokenizer(
|
||||
tokenizer_type,
|
||||
cfg,
|
||||
):
|
||||
use_fast = True # this is the default
|
||||
if cfg.tokenizer_use_fast is not None:
|
||||
use_fast = cfg.tokenizer_use_fast
|
||||
if tokenizer_type:
|
||||
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
|
||||
tokenizer_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
use_fast=use_fast,
|
||||
)
|
||||
else:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
tokenizer_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
use_fast=use_fast,
|
||||
)
|
||||
|
||||
logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
||||
@@ -202,7 +197,7 @@ def load_model(
|
||||
else True,
|
||||
)
|
||||
load_in_8bit = False
|
||||
elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
|
||||
elif cfg.is_llama_derived_model:
|
||||
from transformers import LlamaForCausalLM
|
||||
|
||||
config = LlamaConfig.from_pretrained(base_model_config)
|
||||
@@ -241,7 +236,7 @@ def load_model(
|
||||
# device=cfg.device,
|
||||
# )
|
||||
# model.train() # sets to train instead of eval mode
|
||||
elif model_type and not cfg.trust_remote_code:
|
||||
elif model_type:
|
||||
model = getattr(transformers, model_type).from_pretrained(
|
||||
base_model,
|
||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
"""Module for custom LRScheduler class"""
|
||||
import math
|
||||
from functools import partial
|
||||
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
|
||||
from torch.optim.lr_scheduler import LRScheduler
|
||||
|
||||
|
||||
class InterpolatingLogScheduler(LRScheduler):
|
||||
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
|
||||
lrs = [self.max_lr for base_lr in self.base_lrs]
|
||||
|
||||
return lrs
|
||||
|
||||
|
||||
def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
|
||||
current_step: int,
|
||||
*,
|
||||
num_warmup_steps: int,
|
||||
num_training_steps: int,
|
||||
num_cycles: float
|
||||
):
|
||||
if current_step < num_warmup_steps:
|
||||
return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
|
||||
progress = float(current_step - num_warmup_steps) / float(
|
||||
max(1, num_training_steps - num_warmup_steps)
|
||||
)
|
||||
return max(
|
||||
0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
|
||||
)
|
||||
|
||||
|
||||
def get_cosine_schedule_with_quadratic_warmup(
|
||||
optimizer: Optimizer,
|
||||
num_warmup_steps: int,
|
||||
num_training_steps: int,
|
||||
num_cycles: float = 0.5,
|
||||
last_epoch: int = -1,
|
||||
):
|
||||
"""
|
||||
Create a schedule with a learning rate that decreases following the values of the cosine function between the
|
||||
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
|
||||
initial lr set in the optimizer.
|
||||
|
||||
Args:
|
||||
optimizer ([`~torch.optim.Optimizer`]):
|
||||
The optimizer for which to schedule the learning rate.
|
||||
num_warmup_steps (`int`):
|
||||
The number of steps for the warmup phase.
|
||||
num_training_steps (`int`):
|
||||
The total number of training steps.
|
||||
num_cycles (`float`, *optional*, defaults to 0.5):
|
||||
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
|
||||
following a half-cosine).
|
||||
last_epoch (`int`, *optional*, defaults to -1):
|
||||
The index of the last epoch when resuming training.
|
||||
|
||||
Return:
|
||||
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
|
||||
"""
|
||||
|
||||
lr_lambda = partial(
|
||||
_get_cosine_schedule_with_quadratic_warmup_lr_lambda,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
num_training_steps=num_training_steps,
|
||||
num_cycles=num_cycles,
|
||||
)
|
||||
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||
|
||||
@@ -34,5 +34,3 @@ def check_example_labels(example, tokenizer):
|
||||
|
||||
logging.info(" ".join(colored_tokens))
|
||||
logging.info("\n\n\n")
|
||||
|
||||
return " ".join(colored_tokens)
|
||||
|
||||
@@ -5,82 +5,25 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Optional
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch.cuda
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from torch import nn
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from transformers import (
|
||||
EarlyStoppingCallback,
|
||||
EvalPrediction,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
from transformers import EarlyStoppingCallback, Trainer
|
||||
from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
from axolotl.utils.callbacks import (
|
||||
SaveBetterTransformerModelCallback,
|
||||
SavePeftModelCallback,
|
||||
)
|
||||
from axolotl.utils.schedulers import (
|
||||
InterpolatingLogScheduler,
|
||||
get_cosine_schedule_with_quadratic_warmup,
|
||||
)
|
||||
from axolotl.utils.schedulers import InterpolatingLogScheduler
|
||||
|
||||
|
||||
class AxolotlTrainingArguments(TrainingArguments):
|
||||
"""
|
||||
Extend the base TrainingArguments for axolotl helpers
|
||||
"""
|
||||
|
||||
lr_quadratic_warmup: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Use quadratic warmup for cosine scheduling."},
|
||||
)
|
||||
|
||||
|
||||
class AxolotlTrainer(Trainer):
|
||||
"""
|
||||
Extend the base Trainer for axolotl helpers
|
||||
"""
|
||||
|
||||
args = None # type: AxolotlTrainingArguments
|
||||
|
||||
def create_scheduler(
|
||||
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
||||
):
|
||||
"""
|
||||
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
|
||||
passed as an argument.
|
||||
|
||||
Args:
|
||||
num_training_steps (int): The number of training steps to do.
|
||||
optimizer (torch.optim.Optimizer): The training optimizer
|
||||
"""
|
||||
|
||||
# fmt: off
|
||||
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
|
||||
# fmt: on
|
||||
if (
|
||||
self.args.lr_scheduler_type == "cosine"
|
||||
and self.args.lr_quadratic_warmup is True
|
||||
):
|
||||
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
|
||||
optimizer,
|
||||
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
|
||||
num_training_steps=num_training_steps,
|
||||
)
|
||||
else:
|
||||
return super().create_scheduler(num_training_steps, optimizer)
|
||||
return self.lr_scheduler
|
||||
|
||||
|
||||
class OneCycleLRSchedulerTrainer(AxolotlTrainer):
|
||||
class OneCycleLRSchedulerTrainer(Trainer):
|
||||
"""
|
||||
Trainer subclass that uses the OneCycleLR scheduler
|
||||
"""
|
||||
@@ -160,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
if cfg.fsdp_config:
|
||||
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
|
||||
|
||||
if cfg.lr_quadratic_warmup is not None:
|
||||
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
|
||||
|
||||
# deepspeed
|
||||
if (
|
||||
os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
|
||||
@@ -184,11 +124,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
if cfg.max_grad_norm:
|
||||
training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
|
||||
|
||||
if cfg.hub_model_id:
|
||||
training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
|
||||
training_arguments_kwargs["push_to_hub"] = True
|
||||
|
||||
training_args = AxolotlTrainingArguments(
|
||||
training_args = transformers.TrainingArguments(
|
||||
per_device_train_batch_size=cfg.micro_batch_size,
|
||||
per_device_eval_batch_size=cfg.eval_batch_size
|
||||
if cfg.eval_batch_size is not None
|
||||
@@ -335,23 +271,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
num_proc=32,
|
||||
)
|
||||
|
||||
if cfg.compute_perplexity_metrics:
|
||||
|
||||
def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
|
||||
logits = eval_preds.predictions
|
||||
labels = eval_preds.label_ids
|
||||
cross_entropy_loss = F.cross_entropy(
|
||||
logits.view(-1, model.config.vocab_size), labels.view(-1)
|
||||
)
|
||||
perplexity = torch.exp(cross_entropy_loss)
|
||||
return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
|
||||
|
||||
trainer_kwargs["compute_metrics"] = compute_metrics
|
||||
|
||||
trainer_cls = (
|
||||
OneCycleLRSchedulerTrainer
|
||||
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
|
||||
else AxolotlTrainer
|
||||
else transformers.Trainer
|
||||
)
|
||||
trainer = trainer_cls(
|
||||
model=model,
|
||||
|
||||
@@ -87,16 +87,11 @@ def validate_config(cfg):
|
||||
"You probably want to disable group_by_length as it will force a streamed dataset to download completely."
|
||||
)
|
||||
|
||||
if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
|
||||
if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
|
||||
not cfg.optimizer or "adamw" not in cfg.optimizer
|
||||
):
|
||||
logging.warning("adamw hyperparameters found, but no adamw optimizer set")
|
||||
|
||||
if cfg.push_to_hub_model_id:
|
||||
raise ValueError(
|
||||
"push_to_hub_model_id is deprecated. Please use hub_model_id instead."
|
||||
)
|
||||
|
||||
# TODO
|
||||
# MPT 7b
|
||||
# https://github.com/facebookresearch/bitsandbytes/issues/25
|
||||
|
||||
@@ -7,15 +7,11 @@ from pathlib import Path
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
|
||||
from axolotl.prompt_strategies.alpaca_w_system import (
|
||||
InstructionWSystemPromptTokenizingStrategy,
|
||||
SystemDataPrompter,
|
||||
)
|
||||
from axolotl.prompt_tokenizers import (
|
||||
AlpacaPromptTokenizingStrategy,
|
||||
ShareGPTPromptTokenizingStrategy,
|
||||
)
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle, ShareGPTPrompter
|
||||
from axolotl.prompters import AlpacaPrompter, ShareGPTPrompter
|
||||
|
||||
logging.basicConfig(level="INFO")
|
||||
|
||||
@@ -100,39 +96,5 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
|
||||
assert example["labels"][world_idx - 1] == -100
|
||||
|
||||
|
||||
class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
|
||||
"""
|
||||
Test class for prompt tokenization strategies with sys prompt from the dataset
|
||||
"""
|
||||
|
||||
def setUp(self) -> None:
|
||||
# pylint: disable=duplicate-code
|
||||
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||
self.tokenizer.add_special_tokens(
|
||||
{
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
"unk_token": "<unk>",
|
||||
}
|
||||
)
|
||||
|
||||
def test_system_alpaca(self):
|
||||
prompter = SystemDataPrompter(PromptStyle.CHAT.value)
|
||||
strat = InstructionWSystemPromptTokenizingStrategy(
|
||||
prompter,
|
||||
self.tokenizer,
|
||||
False,
|
||||
2048,
|
||||
)
|
||||
sample = {
|
||||
"system": "use cot",
|
||||
"instruction": "hello!",
|
||||
"output": "Hi! How can I help?",
|
||||
}
|
||||
example = strat.tokenize_prompt(sample)
|
||||
assert example["input_ids"][0:3] == [1, 671, 20118] # <s>use cot
|
||||
assert example["input_ids"][3] == 11889 # USER
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
@@ -2,13 +2,7 @@
|
||||
|
||||
import unittest
|
||||
|
||||
from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
|
||||
from axolotl.prompters import (
|
||||
AlpacaPrompter,
|
||||
MultipleChoiceExplainPrompter,
|
||||
PromptStyle,
|
||||
UnpromptedPrompter,
|
||||
)
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||
|
||||
|
||||
class AlpacaPrompterTest(unittest.TestCase):
|
||||
@@ -61,64 +55,3 @@ class AlpacaPrompterTest(unittest.TestCase):
|
||||
assert "### Response:" not in res
|
||||
assert "USER:" in res
|
||||
assert "ASSISTANT:" in res
|
||||
|
||||
def test_system_prompt(self):
|
||||
prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
|
||||
res = next(
|
||||
prompter.build_prompt_w_system(
|
||||
"use cot", "tell me a joke about the following", "alpacas"
|
||||
)
|
||||
)
|
||||
assert "use cot" in res
|
||||
assert res.startswith("use cot")
|
||||
assert "### Instruction:" not in res
|
||||
assert "### Input:" not in res
|
||||
assert "alpacas" in res
|
||||
assert "### Response:" not in res
|
||||
assert "USER:" in res
|
||||
assert "ASSISTANT:" in res
|
||||
|
||||
|
||||
class UnpromptedPrompterTest(unittest.TestCase):
|
||||
"""
|
||||
Test class for UnpromptedPrompter with no system prompts
|
||||
"""
|
||||
|
||||
def test_prompt_style_w_none(self):
|
||||
prompter = UnpromptedPrompter(prompt_style=None)
|
||||
res = next(prompter.build_prompt("tell me a joke"))
|
||||
assert "### Instruction:" in res
|
||||
assert "tell me a joke" in res
|
||||
assert res.startswith("###")
|
||||
|
||||
def test_prompt_style_w_instruct(self):
|
||||
prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
|
||||
res = next(
|
||||
prompter.build_prompt("tell me a joke about the following", "alpacas")
|
||||
)
|
||||
assert "### Instruction:" in res
|
||||
assert "tell me a joke" in res
|
||||
assert res.startswith("###")
|
||||
|
||||
def test_prompt_style_w_chat(self):
|
||||
prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
|
||||
res = next(
|
||||
prompter.build_prompt("tell me a joke about the following", "alpacas")
|
||||
)
|
||||
assert "USER:" in res
|
||||
assert "tell me a joke" in res
|
||||
assert res.startswith("USER:")
|
||||
|
||||
|
||||
class MultipleChoiceExplainPrompterTest(unittest.TestCase):
|
||||
"""
|
||||
Test class for MultipleChoiceExplainPrompter
|
||||
"""
|
||||
|
||||
def test_prompt_style_w_chat(self):
|
||||
prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
|
||||
res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
|
||||
assert "USER:" in res
|
||||
assert "choose one" in res
|
||||
assert "Choose the answer that best answers the question." in res
|
||||
assert "- A\n- B\n- C" in res
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
"""
|
||||
Test cases for the tokenizer loading
|
||||
"""
|
||||
import unittest
|
||||
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_tokenizer
|
||||
|
||||
|
||||
class TestTokenizers(unittest.TestCase):
|
||||
"""
|
||||
test class for the load_tokenizer fn
|
||||
"""
|
||||
|
||||
def test_default_use_fast(self):
|
||||
cfg = DictDefault({})
|
||||
tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
|
||||
assert "Fast" in tokenizer.__class__.__name__
|
||||
|
||||
def test_dont_use_fast(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"tokenizer_use_fast": False,
|
||||
}
|
||||
)
|
||||
tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
|
||||
assert "Fast" not in tokenizer.__class__.__name__
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": None,
|
||||
"adam_epsilon": 0.0001,
|
||||
"adamw_epsilon": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": "adafactor",
|
||||
"adam_beta1": 0.0001,
|
||||
"adamw_beta1": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"adam_beta1": 0.9,
|
||||
"adam_beta2": 0.99,
|
||||
"adam_epsilon": 0.0001,
|
||||
"adamw_beta1": 0.0001,
|
||||
"adamw_beta2": 0.0001,
|
||||
"adamw_epsilon": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user