Merge pull request #39 from OpenAccess-AI-Collective/dev

Dev to main
This commit is contained in:
Wing Lian
2023-05-24 23:03:22 -04:00
committed by GitHub
19 changed files with 986 additions and 165 deletions

View File

@@ -11,6 +11,15 @@ jobs:
if: github.repository_owner == 'OpenAccess-AI-Collective' if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners... # this job needs to be run on self-hosted GPU runners...
runs-on: self-hosted runs-on: self-hosted
strategy:
matrix:
include:
- cuda: cu118
cuda_version: 11.8.0
pytorch: 2.0.0
- cuda: cu117
cuda_version: 11.7.0
pytorch: 1.13.1
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v3 uses: actions/checkout@v3
@@ -32,7 +41,11 @@ jobs:
context: . context: .
file: ./docker/Dockerfile-base file: ./docker/Dockerfile-base
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }} tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max
build-args: |
CUDA_VERSION=${{ matrix.cuda_version }}
CUDA=${{ matrix.cuda }}
PYTORCH_VERSION=${{ matrix.pytorch }}

View File

@@ -10,6 +10,15 @@ jobs:
build-axolotl: build-axolotl:
if: github.repository_owner == 'OpenAccess-AI-Collective' if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners... # this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: cu118
cuda_version: 11.8.0
pytorch: 2.0.0
- cuda: cu117
cuda_version: 11.7.0
pytorch: 1.13.1
runs-on: self-hosted runs-on: self-hosted
steps: steps:
- name: Checkout - name: Checkout
@@ -31,10 +40,10 @@ jobs:
with: with:
context: . context: .
build-args: | build-args: |
BASE_TAG=${{ github.ref_name }}-base BASE_TAG=${{ github.ref_name }}-base-${{ matrix.cuda }}-${{ matrix.pytorch }}
file: ./docker/Dockerfile file: ./docker/Dockerfile
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }} tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max
@@ -42,6 +51,15 @@ jobs:
needs: build-axolotl needs: build-axolotl
if: github.repository_owner == 'OpenAccess-AI-Collective' if: github.repository_owner == 'OpenAccess-AI-Collective'
# this job needs to be run on self-hosted GPU runners... # this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: cu118
cuda_version: 11.8.0
pytorch: 2.0.0
- cuda: cu117
cuda_version: 11.7.0
pytorch: 1.13.1
runs-on: self-hosted runs-on: self-hosted
steps: steps:
- name: Checkout - name: Checkout
@@ -63,10 +81,10 @@ jobs:
with: with:
context: . context: .
build-args: | build-args: |
BASE_TAG=${{ github.ref_name }} BASE_TAG=${{ github.ref_name }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
file: ./docker/Dockerfile-runpod file: ./docker/Dockerfile-runpod
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }} tags: ${{ steps.metadata.outputs.tags }}-${{ matrix.cuda }}-${{ matrix.pytorch }}
labels: ${{ steps.metadata.outputs.labels }} labels: ${{ steps.metadata.outputs.labels }}
cache-from: type=gha cache-from: type=gha
cache-to: type=gha,mode=max cache-to: type=gha,mode=max

View File

@@ -324,7 +324,7 @@ If you are inferencing a pretrained LORA, pass
--lora_model_dir ./completed-model --lora_model_dir ./completed-model
``` ```
### Merge LORA to base (Dev branch 🔧 ) ### Merge LORA to base
Add below flag to train command above Add below flag to train command above

View File

@@ -1,6 +1,7 @@
ARG CUDA_VERSION="11.8.0" ARG CUDA_VERSION="11.8.0"
ARG CUDNN_VERSION="8" ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04" ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder
@@ -39,6 +40,14 @@ ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
RUN git clone https://github.com/HazyResearch/flash-attention.git && \ RUN git clone https://github.com/HazyResearch/flash-attention.git && \
cd flash-attention && \ cd flash-attention && \
python3 setup.py bdist_wheel && \
# the extension packages are sibling directories under flash-attention/csrc,
# so step sideways with ../ after entering the first one
cd csrc/fused_dense_lib && \
python3 setup.py bdist_wheel && \
cd ../xentropy && \
python3 setup.py bdist_wheel && \
cd ../rotary && \
python3 setup.py bdist_wheel && \
cd ../layer_norm && \
python3 setup.py bdist_wheel python3 setup.py bdist_wheel
FROM base-builder AS deepspeed-builder FROM base-builder AS deepspeed-builder
@@ -60,8 +69,12 @@ RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --g
RUN mkdir /workspace/wheels RUN mkdir /workspace/wheels
COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels COPY --from=deepspeed-builder /workspace/DeepSpeed/dist/deepspeed-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels COPY --from=flash-attn-builder /workspace/flash-attention/dist/flash_attn-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/fused_dense_lib/dist/fused_dense_lib-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/xentropy/dist/xentropy-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/rotary/dist/rotary-*.whl wheels
COPY --from=flash-attn-builder /workspace/flash-attention/csrc/layer_norm/dist/dropout_layer_norm-*.whl wheels
RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl RUN pip3 install wheels/deepspeed-*.whl wheels/flash_attn-*.whl wheels/fused_dense_lib-*.whl wheels/xentropy-*.whl wheels/rotary-*.whl wheels/dropout_layer_norm-*.whl
RUN git lfs install --skip-repo RUN git lfs install --skip-repo
RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" \ RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main" \
"accelerate @ git+https://github.com/huggingface/accelerate.git@main" \ "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \

View File

@@ -1,11 +1,14 @@
ARG BASE_TAG=main ARG BASE_TAG=main
FROM winglian/axolotl:$BASE_TAG FROM winglian/axolotl:$BASE_TAG
COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh
RUN apt install --yes --no-install-recommends openssh-server tmux && \ RUN apt install --yes --no-install-recommends openssh-server tmux && \
mkdir -p ~/.ssh && \ mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \ chmod 700 ~/.ssh && \
printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \ printf "\n[[ -z \"\$TMUX\" ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
chmod +x /root/runpod-entrypoint.sh
ENTRYPOINT ["/workspace/axolotl/scripts/runpod-entrypoint.sh"] ENTRYPOINT ["/root/runpod-entrypoint.sh"]
CMD ["sleep", "infinity"] CMD ["sleep", "infinity"]

View File

@@ -0,0 +1,55 @@
# LoRA fine-tuning config for replit/replit-code-v1-3b.
# Keys left empty are intentionally unset (null).

# model
base_model: replit/replit-code-v1-3b
base_model_config: replit/replit-code-v1-3b
trust_remote_code: true
load_in_8bit: false

# dataset: each entry is one mapping with its own `path` and `type`
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05

# adapter
adapter: lora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - Wqkv
  - mlp_up
  - mlp_down
lora_fan_in_fan_out:

# Weights & Biases logging
wandb_project: lora-replit
wandb_watch:
wandb_run_id:
wandb_log_model:

# training
output_dir: ./lora-replit
batch_size: 8
micro_batch_size: 1
num_epochs: 3
optimizer:
torchdistx_path:
lr_scheduler:
learning_rate: 0.00001
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
gradient_checkpointing:
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_steps: 20
eval_steps: 50
save_steps:
debug:
deepspeed:
weight_decay: 0
fsdp:
fsdp_config:
#special_tokens:

View File

@@ -1,12 +1,12 @@
peft @ git+https://github.com/huggingface/peft.git peft @ git+https://github.com/huggingface/peft.git
transformers @ git+https://github.com/huggingface/transformers.git transformers @ git+https://github.com/huggingface/transformers.git
bitsandbytes>=0.39.0
attrdict attrdict
fire fire
PyYAML==6.0 PyYAML==6.0
black black
bitsandbytes==0.37.2
datasets datasets
accelerate accelerate>=0.19.0
sentencepiece sentencepiece
wandb wandb
einops einops

View File

@@ -1,7 +1,6 @@
import importlib import importlib
import logging import logging
import os import os
import pathlib
import random import random
import signal import signal
import sys import sys
@@ -10,12 +9,12 @@ from typing import Optional
import fire import fire
import torch import torch
import transformers
import yaml import yaml
from attrdict import AttrDefault from attrdict import AttrDefault
# add src to the pythonpath so we don't need to pip install this # add src to the pythonpath so we don't need to pip install this
from axolotl.utils.tokenization import check_dataset_labels from axolotl.utils.tokenization import check_dataset_labels
from axolotl.utils.validation import validate_config
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src") src_dir = os.path.join(project_root, "src")
@@ -33,7 +32,7 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
def choose_device(cfg): def choose_device(cfg):
def get_device(): def get_device():
if torch.cuda.is_available(): if torch.cuda.is_available():
return "cuda" return f"cuda:{cfg.local_rank}"
else: else:
try: try:
if torch.backends.mps.is_available(): if torch.backends.mps.is_available():
@@ -69,7 +68,7 @@ def do_inference(cfg, model, tokenizer, prompter="AlpacaPrompter"):
instruction = get_multi_line_input() instruction = get_multi_line_input()
if not instruction: if not instruction:
return return
prompt = prompter_module().build_prompt(instruction=instruction) prompt: str = next(prompter_module().build_prompt(instruction=instruction))
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
model.eval() model.eval()
@@ -133,7 +132,8 @@ def train(
# then overwrite the value # then overwrite the value
cfg_keys = dict(cfg).keys() cfg_keys = dict(cfg).keys()
for k in kwargs: for k in kwargs:
if k in cfg_keys: # if not strict, allow writing to cfg even if it's not in the yml already
if k in cfg_keys or cfg.strict is False:
# handle booleans # handle booleans
if isinstance(cfg[k], bool): if isinstance(cfg[k], bool):
cfg[k] = bool(kwargs[k]) cfg[k] = bool(kwargs[k])
@@ -159,6 +159,8 @@ def train(
cfg.fp16 = True cfg.fp16 = True
cfg.bf16 = False cfg.bf16 = False
validate_config(cfg)
# Load the model and tokenizer # Load the model and tokenizer
logging.info("loading model, tokenizer, and peft_config...") logging.info("loading model, tokenizer, and peft_config...")
model, tokenizer, peft_config = load_model( model, tokenizer, peft_config = load_model(
@@ -171,6 +173,15 @@ def train(
inference=("inference" in kwargs), inference=("inference" in kwargs),
) )
if "merge_lora" in kwargs and cfg.adapter is not None:
logging.info("running merge of LoRA with base model")
model = model.merge_and_unload()
if cfg.local_rank == 0:
logging.info("saving merged model")
model.save_pretrained(str(Path(cfg.output_dir) / "merged"))
return
if "inference" in kwargs: if "inference" in kwargs:
logging.info("calling do_inference function") logging.info("calling do_inference function")
do_inference(cfg, model, tokenizer) do_inference(cfg, model, tokenizer)
@@ -184,10 +195,6 @@ def train(
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
) )
if prepare_ds_only:
logging.info("Finished preparing dataset. Exiting...")
return
if cfg.debug: if cfg.debug:
logging.info("check_dataset_labels...") logging.info("check_dataset_labels...")
check_dataset_labels( check_dataset_labels(
@@ -197,6 +204,10 @@ def train(
tokenizer, tokenizer,
) )
if prepare_ds_only:
logging.info("Finished preparing dataset. Exiting...")
return
trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer) trainer = setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer)
model.config.use_cache = False model.config.use_cache = False
@@ -218,6 +229,8 @@ def train(
) )
logging.info("Starting trainer...") logging.info("Starting trainer...")
if cfg.group_by_length:
logging.info("hang tight... sorting dataset for group_by_length")
resume_from_checkpoint = cfg.resume_from_checkpoint resume_from_checkpoint = cfg.resume_from_checkpoint
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
possible_checkpoints = [ possible_checkpoints = [
@@ -236,7 +249,9 @@ def train(
logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}") logging.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
model.save_pretrained(cfg.output_dir) # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
if cfg.local_rank == 0:
model.save_pretrained(cfg.output_dir)
# trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time # trainer.save_model(cfg.output_dir) # TODO this may be needed for deepspeed to work? need to review another time

View File

@@ -106,7 +106,7 @@ class ConstantLengthDataset(IterableDataset):
} }
else: else:
logging.warning( logging.warning(
"dropping batch due to tensor size mismatch" f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
) )
buffer = {"input_ids": [], "attention_mask": [], "labels": []} buffer = {"input_ids": [], "attention_mask": [], "labels": []}
buffer_len = 0 buffer_len = 0

View File

@@ -0,0 +1,14 @@
import importlib
def load(strategy, tokenizer, cfg):
    """Resolve a prompt strategy by name and build it.

    ``strategy`` is a dotted module name relative to
    ``axolotl.prompt_strategies``. When its last component starts with
    ``load_``, that component names the loader function and the preceding
    components name the module; otherwise the module's ``load`` function
    is used.

    Returns the value produced by the loader, or ``None`` when the module or
    loader cannot be resolved or the loader raises. The lookup stays
    best-effort, but only ``Exception`` subclasses are absorbed, so
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate.
    """
    import logging  # local import keeps this block self-contained

    try:
        load_fn = "load"
        parts = strategy.split(".")
        if parts[-1].startswith("load_"):
            load_fn = parts[-1]
            strategy = ".".join(parts[:-1])
        module = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies")
        loader = getattr(module, load_fn)
        return loader(tokenizer, cfg)
    except Exception:  # deliberate best-effort: the caller receives None
        # logger method (not logging.debug) so no root handler gets configured
        logging.getLogger(__name__).debug(
            "could not load prompt strategy %r", strategy, exc_info=True
        )
        return None

View File

@@ -0,0 +1,32 @@
from axolotl.prompt_tokenizers import (
AlpacaPromptTokenizingStrategy,
InstructionPromptTokenizingStrategy,
)
from axolotl.prompters import AlpacaPrompter, PromptStyle
def load(tokenizer, cfg):
    """Build the Alpaca tokenizing strategy with a chat-style Alpaca prompter.

    Reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``.
    """
    chat_prompter = AlpacaPrompter(PromptStyle.chat.value)
    strategy = AlpacaPromptTokenizingStrategy(
        chat_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy
class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy for rows that carry ``question`` and ``answer`` keys."""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(question, "", answer)`` taken from the row."""
        instruction = prompt["question"]
        response = prompt["answer"]
        return instruction, "", response
def load_qa(tokenizer, cfg):
    """Build the question/answer tokenizing strategy with a chat-style Alpaca prompter.

    Reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``.
    """
    chat_prompter = AlpacaPrompter(PromptStyle.chat.value)
    strategy = AlpacaQAPromptTokenizingStrategy(
        chat_prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy

View File

@@ -0,0 +1,11 @@
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle
def load(tokenizer, cfg):
    """Build the Alpaca tokenizing strategy with an instruct-style Alpaca prompter.

    Reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``.
    """
    # Pass the style's string value rather than the enum member.
    # NOTE(review): presumably AlpacaPrompter compares against the string
    # values of PromptStyle - confirm against its implementation.
    return AlpacaPromptTokenizingStrategy(
        AlpacaPrompter(PromptStyle.instruct.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )

View File

@@ -0,0 +1,149 @@
from typing import Union, Generator
import yaml
from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
class CreativeAnsweringPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy that pairs a row's ``instruction`` with its ``revision``."""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(instruction, "", revision)`` taken from the row."""
        # don't use prompt["answer"], that's data we don't want in the dataset
        return prompt["instruction"], "", prompt["revision"]
class CreativeCritiquePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy whose target is a YAML critique of a question/answer pair.

    The instruction is ``user_prompt`` filled with the row's ``instruction`` and
    ``answer``; the response is the row's ``scores`` and ``critiques`` dumped as
    block-style YAML and concatenated.
    """

    user_prompt = """Given the following Question and Response, critique the Response on a scale of 1-10. You should critique the answer in the following criteria:
refusal: whether the Response refuses to answer the Question. Responses containing language similar to "As an AI language model, I am not capable of ..." is a form of refusal and should have a low refusal score. 10 is no refusal, 0 is refuses to answer the question.
prescriptive bias: whether the Response attempts to prescribe or dictate certain actions or behaviors of the user. 10 no prescriptive bias present, 0 means there is prescriptive bias.
creativity: score how creative the Response is. 10 is most creative, 0 lacks creativity
comprehensiveness: score how comprehensive and in-depth the Response is. 10 is most comprehensive and in-depth, 0 lacks coverage and depth
Then for each criteria, give a detailed explanation of why the response doesn't meet the criteria (don't reference the score), what it did right, and how it could be improved for that criteria.
Output your answer in YAML format like so:
scores:
refusal: <0-10>
prescriptive_bias: <0-10>
creativity: <0-10>
comprehensiveness: <0-10>
critiques:
refusal:
explanation: ...
improvements: ...
prescriptive_bias:
explanation: ...
improvements: ...
creativity:
explanation: ...
improvements: ...
comprehensiveness:
explanation: ...
improvements: ...
Question: {question}
Answer: {answer}
"""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(filled user_prompt, "", scores YAML + critiques YAML)``."""
        dumped_sections = [
            yaml.dump(prompt[key], default_flow_style=False, Dumper=yaml.Dumper)
            for key in ("scores", "critiques")
        ]
        evaluation = "".join(dumped_sections)
        instruction = self.user_prompt.format(
            question=prompt["instruction"], answer=prompt["answer"]
        )
        return instruction, "", evaluation
class CreativeRevisePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy whose target is the revised answer.

    The instruction is ``user_prompt`` filled with the row's ``instruction``,
    ``answer`` and an evaluation built from its ``scores`` and ``critiques``
    (each dumped as block-style YAML, then concatenated); the response is the
    row's ``revision``.
    """

    user_prompt = """Definitions:
refusal: whether the Response refuses to answer the Question. Responses containing language similar to "As an AI language model, I am not capable of ..." is a form of refusal and should have a low refusal score. 10 is no refusal, 0 is refuses to answer the question.
prescriptive bias: whether the Response attempts to prescribe or dictate certain actions or behaviors of the user. 10 no prescriptive bias present, 0 means there is prescriptive bias.
creativity: score how creative the Response is. 10 is most creative, 0 lacks creativity
comprehensiveness: score how comprehensive and in-depth the Response is. 10 is most comprehensive and in-depth, 0 lacks coverage and depth
Given the following Question, Response, and Evaluation, revise the Response based on the Evaluation and recommendations for improvements. Reply only with the revised response.
Question: {question}
Answer: {answer}
Evaluation:
{evaluation}
"""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(filled user_prompt, "", revision)``."""
        scores = yaml.dump(
            prompt["scores"], default_flow_style=False, Dumper=yaml.Dumper
        )
        critiques = yaml.dump(
            prompt["critiques"], default_flow_style=False, Dumper=yaml.Dumper
        )
        evaluation = scores + critiques
        question = prompt["instruction"]
        answer = prompt["answer"]
        return (
            self.user_prompt.format(
                question=question, answer=answer, evaluation=evaluation
            ),
            "",
            prompt["revision"],
        )
class CreativePrompterBase:
    """Prompter that renders a single ``USER:`` / ``ASSISTANT:`` turn.

    Subclasses override ``system_prompt``; when it is non-empty it is placed on
    its own line before the user turn.
    """

    system_prompt = ""
    prompt_input = "{system_prompt}\nUSER: {instruction}\nASSISTANT:"

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        """Yield exactly one prompt string.

        ``input`` is accepted but not used. When ``output`` is truthy it is
        appended directly after ``ASSISTANT:``.
        """
        pieces = []
        if self.system_prompt:
            pieces.append(f"{self.system_prompt}\n")
        pieces.append(f"USER: {instruction}\nASSISTANT:")
        if output:
            pieces.append(f"{output}")
        yield "".join(pieces)
class CreativeAnswerPrompter(CreativePrompterBase):
    """Prompter for the answering task; sets the system prompt sent before the user turn."""

    system_prompt = (
        "Answer the following question in a comprehensive, in-depth, and creative way. "
        "Additionally your response should be relevant, accurate, and free of any ambiguity."
    )
class CreativeCritiquePrompter(CreativePrompterBase):
    """Prompter for the critique task; uses no system prompt."""

    system_prompt = str()
class CreativeRevisePrompter(CreativePrompterBase):
    """Prompter for the revision task; uses no system prompt."""

    system_prompt = str()
def load_answer(tokenizer, cfg):
    """Build the answering strategy; reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``."""
    prompter = CreativeAnswerPrompter()
    strategy = CreativeAnsweringPromptTokenizingStrategy(
        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy
def load_critique(tokenizer, cfg):
    """Build the critique strategy; reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``."""
    prompter = CreativeCritiquePrompter()
    strategy = CreativeCritiquePromptTokenizingStrategy(
        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy
def load_revise(tokenizer, cfg):
    """Build the revision strategy; reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``."""
    prompter = CreativeRevisePrompter()
    strategy = CreativeRevisePromptTokenizingStrategy(
        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy

View File

@@ -0,0 +1,110 @@
import copy
import logging
from collections import defaultdict
from typing import Generator
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
IGNORE_TOKEN_ID = -100
class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
    """Tokenizes multi-turn conversations using ``<|system|>``, ``<|user|>`` and
    ``<|model|>`` role prefixes.

    Each turn produced by the prompter is tokenized separately and the pieces
    are concatenated. System and human turns are fully masked in ``labels``;
    for bot turns only the leading ``<|model|>`` prefix tokens are masked.
    """

    bot_prefix_token_ids = []

    def __init__(self, prompter, tokenizer, *args, **kwargs):
        # Forward the remaining arguments instead of dropping them: the loader
        # passes train_on_inputs and sequence_len positionally.
        # NOTE(review): assumes the base __init__ accepts them, as the sibling
        # strategies are constructed the same way - confirm.
        super().__init__(prompter, tokenizer, *args, **kwargs)
        res = self._tokenize("<|model|>", add_eos_token=False, strip_bos_token=True)
        self.bot_prefix_token_ids = res["input_ids"]

    def tokenize_prompt(self, prompt):
        """Tokenize ``prompt["conversations"]`` into one training example.

        Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``
        lists of equal length. Turns with an unrecognised role are logged and
        skipped.
        """
        result = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
        }
        start_marker = "\n<START>"
        for role, message in self.prompter.build_prompt(prompt["conversations"]):
            if role == "system":
                # this should include a bos token, no eos token, strip trailing "\n<START>"
                if message.endswith(start_marker):
                    message = message[: -len(start_marker)]
                res = self._tokenize(
                    "<|system|>" + "Persona: " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=False,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "human":
                res = self._tokenize(
                    "<|user|>" + " " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=True,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "bot":
                res = self._tokenize(
                    "<|model|>" + " " + message.strip(),
                    add_eos_token=True,
                    strip_bos_token=True,
                )
                # mask out the prefix tokens, rest is not masked out from labels;
                # cap the mask at the tokenized length so labels never outgrow
                # input_ids when the turn was truncated
                turn_ids = res["input_ids"]
                prefix_len = min(len(self.bot_prefix_token_ids), len(turn_ids))
                labels = [IGNORE_TOKEN_ID] * prefix_len + turn_ids[prefix_len:]
            else:
                logging.warning(f"unknown role in conversation: {role}")
                # nothing to add for this turn; skipping also avoids reusing
                # the previous turn's labels
                continue

            input_ids = res["input_ids"]
            result["input_ids"].extend(input_ids)
            result["attention_mask"].extend(
                1 if x != self.tokenizer.pad_token_id else 0 for x in input_ids
            )
            result["labels"].extend(labels)
        return result

    def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
        """Tokenize one piece of text without padding, truncated to ``sequence_len``.

        Appends the eos token when ``add_eos_token`` is set, it is not already
        last, and there is room; drops a leading bos token when
        ``strip_bos_token`` is set. ``labels`` is a copy of the final
        ``input_ids``. Empty tokenizer output is handled without indexing errors.
        """
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None,
        )
        ids = result["input_ids"]
        if (
            add_eos_token
            and len(ids) < self.sequence_len
            and (not ids or ids[-1] != self.tokenizer.eos_token_id)
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        if (
            strip_bos_token
            and result["input_ids"]
            and result["input_ids"][0] == self.tokenizer.bos_token_id
        ):
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result
class PygmalionPrompter:
    """Prompter that passes conversation turns through as ``(role, value)`` pairs."""

    def __init__(self, *args, **kwargs):
        # accepts and ignores any arguments
        pass

    def build_prompt(self, source, *args, **kwargs) -> Generator[tuple, None, None]:
        """Yield ``(msg["role"], msg["value"])`` for each message in ``source``.

        Extra positional and keyword arguments are accepted and ignored.
        """
        for msg in source:
            yield msg["role"], msg["value"]
def load(tokenizer, cfg):
    """Build the Pygmalion strategy; reads ``train_on_inputs`` and ``sequence_len`` from ``cfg``."""
    prompter = PygmalionPrompter()
    strategy = PygmalionPromptTokenizingStrategy(
        prompter, tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )
    return strategy

View File

@@ -1,7 +1,12 @@
import abc import abc
import copy
import functools
import logging
from transformers import PreTrainedTokenizer from transformers import PreTrainedTokenizer
from axolotl.prompters import IGNORE_TOKEN_ID
IGNORE_INDEX = -100 IGNORE_INDEX = -100
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" LLAMA_DEFAULT_PAD_TOKEN = "[PAD]"
LLAMA_DEFAULT_EOS_TOKEN = "</s>" LLAMA_DEFAULT_EOS_TOKEN = "</s>"
@@ -30,6 +35,20 @@ class PromptTokenizingStrategy(abc.ABC):
def tokenize_prompt(self, prompt): def tokenize_prompt(self, prompt):
pass pass
@functools.cache
def _get_user_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|USER|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
return False
@functools.cache
def _get_assistant_token(self):
id_or_ids = self.tokenizer.convert_tokens_to_ids("<|ASSISTANT|>")
if isinstance(id_or_ids, (int,)):
return id_or_ids
return False
class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy): class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
def parse_instruction_fields(self, prompt) -> (str, str, str): def parse_instruction_fields(self, prompt) -> (str, str, str):
@@ -40,9 +59,13 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
full_prompt = self._build_full_prompt(instruction, input, response) full_prompt = self._build_full_prompt(instruction, input, response)
tokenized_full_prompt = self._tokenize(full_prompt) tokenized_full_prompt = self._tokenize(full_prompt)
if not self.train_on_inputs: if not self.train_on_inputs:
user_prompt = self.prompter.build_prompt( user_prompt = next(
instruction, iter(
input, self.prompter.build_prompt(
instruction,
input,
)
)
) )
tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False) tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False)
user_prompt_len = len(tokenized_user_prompt["input_ids"]) user_prompt_len = len(tokenized_user_prompt["input_ids"])
@@ -54,13 +77,17 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
return tokenized_full_prompt return tokenized_full_prompt
def _build_full_prompt(self, instruction, input, response): def _build_full_prompt(self, instruction, input, response):
return self.prompter.build_prompt( return next(
instruction, iter(
input, self.prompter.build_prompt(
response, instruction,
input,
response,
)
)
) )
def _tokenize(self, prompt, add_eos_token=True): def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
result = self.tokenizer( result = self.tokenizer(
prompt, prompt,
truncation=True, truncation=True,
@@ -76,6 +103,10 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
result["input_ids"].append(self.tokenizer.eos_token_id) result["input_ids"].append(self.tokenizer.eos_token_id)
result["attention_mask"].append(1) result["attention_mask"].append(1)
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
result["input_ids"] = result["input_ids"][1:]
result["attention_mask"] = result["attention_mask"][1:]
result["labels"] = result["input_ids"].copy() result["labels"] = result["input_ids"].copy()
return result return result
@@ -89,6 +120,15 @@ class AlpacaPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
) )
class AlpacaMultipleChoicePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy for multiple-choice rows."""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(question, choices, answer)``.

        ``choices`` is one ``- "<choice>"`` line per entry of ``prompt["choices"]``;
        the answer is ``prompt["solution"]`` when present, else ``prompt["explanation"]``.
        """
        question = prompt["question"]
        choice_lines = [f'- "{choice}"' for choice in prompt["choices"]]
        if "solution" in prompt:
            answer = prompt["solution"]
        else:
            answer = prompt["explanation"]
        return question, "\n".join(choice_lines), answer
class JeopardyPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): class JeopardyPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
def parse_instruction_fields(self, prompt) -> (str, str, str): def parse_instruction_fields(self, prompt) -> (str, str, str):
return ( return (
@@ -107,6 +147,15 @@ class OpenAssistantPromptTokenizingStrategy(InstructionPromptTokenizingStrategy)
) )
class SummarizeTLDRPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """Instruction strategy for rows that carry ``article`` and ``summary`` keys."""

    def parse_instruction_fields(self, prompt) -> (str, str, str):
        """Return ``(article, "", summary)`` taken from the row."""
        instruction = prompt["article"]
        response = prompt["summary"]
        return instruction, "", response
class GPTeacherPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): class GPTeacherPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
def parse_instruction_fields(self, prompt) -> (str, str, str): def parse_instruction_fields(self, prompt) -> (str, str, str):
return ( return (
@@ -131,13 +180,13 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
def tokenize_prompt(self, prompt): def tokenize_prompt(self, prompt):
instruction = self.parse_instruction_fields(prompt) instruction = self.parse_instruction_fields(prompt)
full_prompt = self._build_full_prompt(instruction) full_prompt = self._build_full_prompt(instruction, None, None)
tokenized_full_prompt = self._tokenize(full_prompt) tokenized_full_prompt = self._tokenize(full_prompt)
return tokenized_full_prompt return tokenized_full_prompt
def _build_full_prompt(self, instruction): def _build_full_prompt(self, instruction, input, response):
return self.prompter.build_prompt(instruction) return next(iter(self.prompter.build_prompt(instruction)))
class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy): class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
@@ -157,9 +206,13 @@ class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
) )
tokenized_full_prompt = self._tokenize(full_prompt) tokenized_full_prompt = self._tokenize(full_prompt)
if not self.train_on_inputs: if not self.train_on_inputs:
user_prompt = self.prompter.build_prompt( user_prompt = next(
instruction, iter(
input, self.prompter.build_prompt(
instruction,
input,
)
)
) )
tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False) tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False)
user_prompt_len = len(tokenized_user_prompt["input_ids"]) user_prompt_len = len(tokenized_user_prompt["input_ids"])
@@ -171,12 +224,16 @@ class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
return tokenized_full_prompt return tokenized_full_prompt
def _build_full_prompt(self, instruction, input, output, reflection, corrected): def _build_full_prompt(self, instruction, input, output, reflection, corrected):
return self.prompter.build_prompt( return next(
instruction, iter(
input, self.prompter.build_prompt(
output, instruction,
reflection, input,
corrected, output,
reflection,
corrected,
)
)
) )
def _tokenize(self, prompt, add_eos_token=True): def _tokenize(self, prompt, add_eos_token=True):
@@ -212,7 +269,80 @@ class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy):
class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizes a multi-turn conversation one part at a time.

    The leading (non-tuple) part and every user turn are masked out of the
    labels with IGNORE_TOKEN_ID; assistant turns keep their token ids as labels.
    """

    def tokenize_prompt(self, prompt):
        """Tokenize ``prompt["conversations"]`` into one flat training example.

        Returns a dict with ``input_ids``, ``attention_mask`` and ``labels``
        lists of equal length.

        Raises InvalidDataException when building or tokenizing the
        conversation raises KeyError, AssertionError or IndexError.
        """
        result = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
        }
        # NOTE(review): _get_user_token/_get_assistant_token are not defined in
        # this class; presumably provided by the base class -- confirm.
        user_token = self._get_user_token()
        assistant_token = self._get_assistant_token()
        try:
            for part in self.prompter.build_prompt(prompt["conversations"]):
                if isinstance(part, tuple):
                    role = part[0]
                    if role == "USER:":
                        # keep the textual role prefix only when there is no
                        # dedicated user token to stand in for it
                        text = part[0] + part[1] if not user_token else part[1]
                        # user query: no eos token, and no second bos token
                        res = self._tokenize(
                            text.strip(), add_eos_token=False, strip_bos_token=True
                        )
                        if user_token:
                            res["input_ids"] = [user_token, *res["input_ids"]]
                        # the whole user turn is masked out from the labels
                        labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                    elif role == "ASSISTANT:":
                        # TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
                        text = part[0] + part[1] if not assistant_token else part[1]
                        # assistant response: should end with an eos token
                        res = self._tokenize(
                            text.strip(), add_eos_token=True, strip_bos_token=True
                        )
                        if assistant_token:
                            res["input_ids"] = [assistant_token, *res["input_ids"]]
                        # not masked out from labels
                        labels = copy.deepcopy(res["input_ids"])
                    else:
                        logging.warning("unhandled role: " + part[0])
                        # Skip this part: falling through would re-append the
                        # previous turn's tokens (or hit unbound locals).
                        continue
                else:
                    # non-tuple part (the preamble): the bos token is kept
                    res = self._tokenize(
                        part.strip(), add_eos_token=False, strip_bos_token=False
                    )
                    # everything from this is masked out from the labels
                    labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
                input_ids = res["input_ids"]
                result["input_ids"].extend(input_ids)
                result["attention_mask"].extend(
                    1 if x != self.tokenizer.pad_token_id else 0 for x in input_ids
                )
                result["labels"].extend(labels)
            return result
        except (KeyError, AssertionError, IndexError) as e:
            raise InvalidDataException(str(e)) from e

    def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
        """Tokenize a single text fragment, truncated to ``self.sequence_len``.

        add_eos_token: append the eos token id when the result does not already
            end with it and is shorter than ``self.sequence_len``.
        strip_bos_token: drop a leading bos token id if one is present.

        Returns the tokenizer result with ``labels`` set to a copy of
        ``input_ids``. An empty tokenization raises IndexError on the
        first/last-token checks; ``tokenize_prompt`` converts that into
        InvalidDataException.
        """
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < self.sequence_len
            and add_eos_token
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result

View File

@@ -1,22 +1,52 @@
import copy import copy
import dataclasses import dataclasses
import logging
from enum import auto, Enum from enum import auto, Enum
from typing import List, Tuple, Any, Union from typing import List, Tuple, Any, Union, Generator
IGNORE_TOKEN_ID = -100 IGNORE_TOKEN_ID = -100
class PromptStyle(Enum):
    """Names of the prompt layouts the prompters compare ``prompt_style`` against."""

    # "### Instruction: / ### Response:" layout
    instruct = "instruct"
    # "USER: / ASSISTANT:" layout
    chat = "chat"
class AlpacaPrompter: class AlpacaPrompter:
prompt_input = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"
prompt_no_input = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n" system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n"
response_split = "### Response:" prompt_style = None
def __init__(self, prompt_style="instruct"):
    """Store the prompt style and build the matching prompt templates.

    prompt_style: "instruct" or "chat". A falsy value (e.g. ``None``) falls
        back to the instruct style.
    """
    # The dataset loader passes None when the dataset type carries no
    # ":<style>" suffix; without this fallback match_prompt_style() would set
    # no templates at all.
    self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value
    self.match_prompt_style()
def match_prompt_style(self):
    """Set ``prompt_input``, ``prompt_no_input`` and ``response_split`` on the
    instance according to ``self.prompt_style``.

    Nothing is assigned when the style is neither "instruct" nor "chat".
    """
    style = self.prompt_style
    if style == PromptStyle.instruct.value:
        with_input = (
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
        )
        without_input = "### Instruction:\n{instruction}\n\n### Response:\n"
        self.prompt_input = self.system_prompt + with_input
        self.prompt_no_input = self.system_no_input_prompt + without_input
        self.response_split = "### Response:"
    if style == PromptStyle.chat.value:
        with_input = "USER: {instruction}\n{input}\nASSISTANT:"
        without_input = "USER: {instruction}\nASSISTANT:"
        self.prompt_input = self.system_prompt + with_input
        self.prompt_no_input = self.system_no_input_prompt + without_input
        self.response_split = "ASSISTANT:"
def build_prompt( def build_prompt(
self, self,
instruction: str, instruction: str,
input: Union[None, str] = None, input: Union[None, str] = None,
output: Union[None, str] = None, output: Union[None, str] = None,
) -> str: ) -> Generator[str, None, None]:
# returns the full prompt from instruction and optional input # returns the full prompt from instruction and optional input
# if a label (=response, =output) is provided, it's also appended. # if a label (=response, =output) is provided, it's also appended.
if input: if input:
@@ -25,19 +55,42 @@ class AlpacaPrompter:
res = self.prompt_no_input.format(instruction=instruction) res = self.prompt_no_input.format(instruction=instruction)
if output: if output:
res = f"{res}{output}" res = f"{res}{output}"
return res yield res
def get_response(self, output: str) -> str:
    """Return the text between the first and second ``self.response_split``
    markers in ``output`` (or everything after the only marker), stripped.

    Raises IndexError when the marker does not occur in ``output``.
    """
    pieces = output.split(self.response_split)
    after_marker = pieces[1]
    return after_marker.strip()
class UnpromptedPrompter(AlpacaPrompter):
    """AlpacaPrompter variant whose system preamble is empty in both forms."""

    # empty preambles: the templates start directly at the instruction/user turn
    system_prompt = ""
    system_no_input_prompt = ""
class JeopardyPrompter(AlpacaPrompter):
    """AlpacaPrompter variant with a Jeopardy-specific system preamble."""

    # match_prompt_style() assigns prompt_input on the instance, which shadows
    # a class-level prompt_input. Supplying the wording through system_prompt
    # makes it part of the templates that method builds.
    # NOTE(review): "tbe" looks like a typo for "the"; the runtime string is
    # left untouched here -- confirm before changing prompt text.
    system_prompt = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n"
    # Kept (same text as before) for the case where no style is matched and
    # the class-level template is what gets used.
    prompt_input = (
        system_prompt
        + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )
class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """AlpacaPrompter variant asking for an answer choice plus reasoning."""

    # The trailing blank line matters: match_prompt_style() concatenates this
    # directly with "### Instruction:" / "USER:", exactly as it does with the
    # base class preambles, which all end in "\n\n".
    system_prompt = (
        "Choose the answer that best answers the question. Explain your reasoning.\n\n"
    )
class MultipleChoiceConcisePrompter(AlpacaPrompter):
    """AlpacaPrompter variant asking for a concise answer choice."""

    # match_prompt_style() assigns prompt_input on the instance, which shadows
    # the class-level template below. Supplying the wording through
    # system_prompt makes it part of the templates that method builds.
    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
    # Kept (same text as before) for the case where no style is matched and
    # the class-level template is what gets used.
    prompt_input = system_prompt + "USER: {instruction}\n{input}\nASSISTANT:\n"
class SummarizeTLDRPrompter(AlpacaPrompter):
    """AlpacaPrompter variant with a class-level TL;DR summarization template."""

    # NOTE(review): AlpacaPrompter.match_prompt_style() assigns prompt_no_input
    # on the instance for the "instruct" and "chat" styles, which shadows this
    # class-level template -- confirm that is intended.
    prompt_no_input = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
class CompletionPrompter(AlpacaPrompter):
    """Prompter that passes the text through without any template."""

    def build_prompt(
        self, instruction: str, input=None, output=None
    ) -> Generator[str, None, None]:
        """Yield ``instruction`` unchanged; ``input`` and ``output`` are ignored."""
        yield from (instruction,)

    def get_response(self, output: str) -> str:
        """Return ``output`` with surrounding whitespace removed."""
        stripped = output.strip()
        return stripped
@@ -52,11 +105,44 @@ class NomicGPT4AllPrompter(AlpacaPrompter):
class ReflectAlpacaPrompter: class ReflectAlpacaPrompter:
prompt_input = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n" system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
prompt_no_input = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n### Instruction:\n{instruction}\n\n### Response:\n" system_no_input_prompt = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
agent_label = "{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
prompt_input = (
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
)
prompt_no_input = "### Instruction:\n{instruction}\n\n### Response:\n"
agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
response_split = "### Response:" response_split = "### Response:"
def __init__(self, prompt_style="instruct"):
    """Store the prompt style and build the matching prompt templates.

    prompt_style: "instruct" or "chat". A falsy value (e.g. ``None``) falls
        back to the instruct style.
    """
    # The dataset loader passes None when the dataset type carries no
    # ":<style>" suffix; match_prompt_style() would then leave the class-level
    # templates in place, and those carry no system prompt.
    self.prompt_style = prompt_style if prompt_style else PromptStyle.instruct.value
    self.match_prompt_style()
def match_prompt_style(self):
    """Set ``prompt_input``, ``prompt_no_input``, ``agent_label`` and
    ``response_split`` on the instance according to ``self.prompt_style``.

    Nothing is assigned when the style is neither "instruct" nor "chat".
    """
    style = self.prompt_style
    if style == PromptStyle.instruct.value:
        with_input = (
            "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
        )
        without_input = "### Instruction:\n{instruction}\n\n### Response:\n"
        self.prompt_input = self.system_prompt + with_input
        self.prompt_no_input = self.system_no_input_prompt + without_input
        self.agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
        self.response_split = "### Final Response:"
    if style == PromptStyle.chat.value:
        with_input = "USER: {instruction}\n{input}\nASSISTANT:"
        without_input = "USER: {instruction}\nASSISTANT:"
        self.prompt_input = self.system_prompt + with_input
        self.prompt_no_input = self.system_no_input_prompt + without_input
        # NOTE(review): unlike the instruct label, this template has no
        # {corrected} placeholder -- confirm the final response is meant to be
        # left out of chat-style prompts.
        self.agent_label = (
            "\nTHOUGHT: {output}\nASSISTANT REFLECTION: {reflection}\nASSISTANT:"
        )
        self.response_split = "ASSISTANT:"
def build_prompt( def build_prompt(
self, self,
instruction: str, instruction: str,
@@ -64,7 +150,7 @@ class ReflectAlpacaPrompter:
output: Union[None, str] = None, output: Union[None, str] = None,
reflection: Union[None, str] = None, reflection: Union[None, str] = None,
corrected: Union[None, str] = None, corrected: Union[None, str] = None,
) -> str: ) -> Generator[str, None, None]:
# returns the full prompt from instruction and optional input # returns the full prompt from instruction and optional input
# if a label (=response, =output) is provided, it's also appended. # if a label (=response, =output) is provided, it's also appended.
if input: if input:
@@ -76,7 +162,7 @@ class ReflectAlpacaPrompter:
output=output, reflection=reflection, corrected=corrected output=output, reflection=reflection, corrected=corrected
) )
res = f"{res}{label}" res = f"{res}{label}"
return res yield res
def get_response(self, output: str) -> str:
    """Return the text between the first and second ``self.response_split``
    markers in ``output`` (or everything after the only marker), stripped.

    Raises IndexError when the marker does not occur in ``output``.
    """
    segments = output.split(self.response_split)
    response_text = segments[1]
    return response_text.strip()
@@ -103,15 +189,16 @@ class Conversation:
sep: str = "###" sep: str = "###"
sep2: str = None sep2: str = None
def get_prompt(self) -> Generator[Union[str, Tuple[str, ...]], None, None]:
    """Yield the conversation piece by piece instead of as one string.

    First yields the preamble (``self.system + self.sep``) as a plain string,
    then one tuple per message: ``(role + ":", " " + message)`` when the
    message is non-empty, otherwise the 1-tuple ``(role + ":",)`` after
    logging a warning.
    """
    yield self.system + self.sep
    for role, message in self.messages:
        if message:
            yield (role + ":", " " + message)
        else:
            logging.warning("role with empty message: " + role)
            yield (role + ":",)
def copy(self): def copy(self):
return Conversation( return Conversation(
@@ -136,12 +223,24 @@ conv_vicuna_v1_1 = Conversation(
offset=0, offset=0,
sep_style=SeparatorStyle.TWO, sep_style=SeparatorStyle.TWO,
sep=" ", sep=" ",
sep2="</s>", sep2=" ",
) )
class ShareGPTPrompter: class ShareGPTPrompter:
def __init__(self, prompt_style=None):
    """Accept only the chat prompt style.

    Raises Exception for any other ``prompt_style``, including the default
    ``None``.
    """
    if prompt_style == PromptStyle.chat.value:
        return
    raise Exception(
        f"unsupported prompt_style for ShareGPTPrompter({prompt_style})"
    )
# def match_prompt_style(self):
# if self.prompt_style == PromptStyle.chat.value:
# self.prompt_input = self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
# self.prompt_no_input = self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
# self.response_split = "ASSISTANT:"
def build_prompt(self, source, *args, **kwargs) -> Generator[str, None, None]:
# ignore the system prompt if provided # ignore the system prompt if provided
if source[0]["from"] == "system": if source[0]["from"] == "system":
source.pop(0) source.pop(0)
@@ -171,61 +270,6 @@ class ShareGPTPrompter:
role = roles[sentence["from"]] role = roles[sentence["from"]]
assert role == conv.roles[j % 2] assert role == conv.roles[j % 2]
conv.append_message(role, sentence["value"]) conv.append_message(role, sentence["value"])
# TODO, this concatenates everything, but doesn't seem to properly add the eos_token_id, as the eos_token gets split up
conversation = conv.get_prompt()
# Tokenize conversations for part in conv.get_prompt():
tokenized_result = tokenizer( yield part
conversation,
truncation=True,
max_length=sequence_len, # FIXME
padding=False,
return_tensors=None,
)
target = copy.deepcopy(tokenized_result["input_ids"])
# Mask targets
sep = conv.sep + conv.roles[1] + ": "
rounds = conversation.split(conv.sep2)
rounds = [r + conv.sep2 for r in rounds]
cur_len = 1
target[0] = IGNORE_TOKEN_ID # mask out the bos
for i, rou in enumerate(rounds):
if rou == "":
break
parts = rou.split(sep)
if len(parts) != 2:
break
parts[0] += sep
round_len = (
len(tokenizer(rou)["input_ids"]) - 1
) # -1 ignores the bos_token generated for this
# we have to strip the initial part, any dangling whitespace creates an additional ghost token
instruction_len = (
len(tokenizer(parts[0].strip())["input_ids"]) - 1
) # -1 ignores the bos_token generated for this
target[cur_len : cur_len + instruction_len] = [
IGNORE_TOKEN_ID
] * instruction_len
cur_len += round_len
if cur_len >= sequence_len:
break
# Fix: Truncate the target to have the same length as input_ids
target = target[: len(tokenized_result["input_ids"])]
# target[cur_len:] = [IGNORE_TOKEN_ID] * (len(target) - cur_len)
attention_mask = [
1 if x != tokenizer.pad_token_id else 0
for x in tokenized_result["input_ids"]
]
# TODO truncate len to sequence_len
return dict(
input_ids=tokenized_result["input_ids"],
labels=target,
attention_mask=attention_mask,
)

View File

@@ -8,10 +8,13 @@ from datasets import (
IterableDataset, IterableDataset,
Dataset, Dataset,
concatenate_datasets, concatenate_datasets,
DatasetDict,
) )
from huggingface_hub import hf_hub_download from huggingface_hub import hf_hub_download
from transformers import PreTrainedTokenizerBase
from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset
from axolotl.prompt_strategies import load
from axolotl.prompt_tokenizers import ( from axolotl.prompt_tokenizers import (
AlpacaPromptTokenizingStrategy, AlpacaPromptTokenizingStrategy,
GPTeacherPromptTokenizingStrategy, GPTeacherPromptTokenizingStrategy,
@@ -20,6 +23,8 @@ from axolotl.prompt_tokenizers import (
ShareGPTPromptTokenizingStrategy, ShareGPTPromptTokenizingStrategy,
JeopardyPromptTokenizingStrategy, JeopardyPromptTokenizingStrategy,
CompletionPromptTokenizingStrategy, CompletionPromptTokenizingStrategy,
AlpacaMultipleChoicePromptTokenizingStrategy,
SummarizeTLDRPromptTokenizingStrategy,
) )
from axolotl.prompters import ( from axolotl.prompters import (
AlpacaPrompter, AlpacaPrompter,
@@ -28,16 +33,24 @@ from axolotl.prompters import (
ShareGPTPrompter, ShareGPTPrompter,
JeopardyPrompter, JeopardyPrompter,
CompletionPrompter, CompletionPrompter,
MultipleChoiceExplainPrompter,
SummarizeTLDRPrompter,
MultipleChoiceConcisePrompter,
) )
def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_path): def load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path
) -> DatasetDict:
tokenizer_name = tokenizer.__class__.__name__
ds_hash = str( ds_hash = str(
md5( md5(
( (
str(cfg.sequence_len) str(cfg.sequence_len)
+ "@" + "@"
+ "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets])) + "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets]))
+ "|"
+ tokenizer_name
).encode("utf-8") ).encode("utf-8")
).hexdigest() ).hexdigest()
) )
@@ -46,8 +59,19 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
if cfg.dataset_prepared_path if cfg.dataset_prepared_path
else Path(default_dataset_prepared_path) / ds_hash else Path(default_dataset_prepared_path) / ds_hash
) )
dataset = None
try:
if cfg.push_dataset_to_hub:
dataset = load_dataset(
f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
)
dataset = dataset["train"]
except:
pass
if any(prepared_ds_path.glob("*")): if dataset:
...
elif any(prepared_ds_path.glob("*")):
logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...") logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
dataset = load_from_disk(str(prepared_ds_path)) dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared dataset loaded from disk...") logging.info("Prepared dataset loaded from disk...")
@@ -59,7 +83,7 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
ds = None ds = None
ds_from_hub = False ds_from_hub = False
try: try:
load_dataset(d.path, streaming=True) load_dataset(d.path, streaming=True, use_auth_token=True)
ds_from_hub = True ds_from_hub = True
except FileNotFoundError: except FileNotFoundError:
pass pass
@@ -67,64 +91,117 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
# prefer local dataset, even if hub exists # prefer local dataset, even if hub exists
if Path(d.path).exists(): if Path(d.path).exists():
ds: IterableDataset = load_dataset( ds: IterableDataset = load_dataset(
"json", data_files=d.path, streaming=True, split=None "json", data_files=d.path, streaming=False, split=None
) )
elif ds_from_hub: elif ds_from_hub:
if d.data_files: if d.data_files:
ds = load_dataset(d.path, streaming=True, data_files=d.data_files) ds = load_dataset(
d.path,
streaming=False,
data_files=d.data_files,
use_auth_token=True,
)
else: else:
ds = load_dataset(d.path, streaming=True) ds = load_dataset(d.path, streaming=False, use_auth_token=True)
else: else:
fp = hf_hub_download( fp = hf_hub_download(
repo_id=d.path, repo_type="dataset", filename=d.data_files repo_id=d.path, repo_type="dataset", filename=d.data_files
) )
ds = load_dataset("json", data_files=fp, streaming=True, split=None) ds = load_dataset("json", data_files=fp, streaming=False, split=None)
if not ds: if not ds:
raise Exception("unhandled dataset load") raise Exception("unhandled dataset load")
# support for using a subset of the data
if d.type == "alpaca": if d.shards:
ds = ds.shuffle(seed=42)["train"].shard(num_shards=cfg.shards, index=0)
d_type = d.type
d_type_split = d_type.split(":")
d_base_type = d_type_split[0]
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
if ds_strategy := load(d.type, tokenizer, cfg):
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d_base_type == "alpaca":
ds_strategy = AlpacaPromptTokenizingStrategy( ds_strategy = AlpacaPromptTokenizingStrategy(
AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len AlpacaPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "jeopardy": elif d_base_type == "explainchoice":
ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
MultipleChoiceExplainPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d_base_type == "concisechoice":
ds_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
MultipleChoiceConcisePrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d_base_type == "summarizetldr":
ds_strategy = SummarizeTLDRPromptTokenizingStrategy(
SummarizeTLDRPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
)
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper)
elif d_base_type == "jeopardy":
ds_strategy = JeopardyPromptTokenizingStrategy( ds_strategy = JeopardyPromptTokenizingStrategy(
JeopardyPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len JeopardyPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "oasst": elif d_base_type == "oasst":
ds_strategy = OpenAssistantPromptTokenizingStrategy( ds_strategy = OpenAssistantPromptTokenizingStrategy(
AlpacaPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len AlpacaPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "gpteacher": elif d_base_type == "gpteacher":
ds_strategy = GPTeacherPromptTokenizingStrategy( ds_strategy = GPTeacherPromptTokenizingStrategy(
GPTeacherPrompter(), GPTeacherPrompter(d_prompt_style),
tokenizer, tokenizer,
cfg.train_on_inputs, cfg.train_on_inputs,
cfg.sequence_len, cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "reflection": elif d_base_type == "reflection":
ds_strategy = AlpacaReflectionPTStrategy( ds_strategy = AlpacaReflectionPTStrategy(
ReflectAlpacaPrompter(), ReflectAlpacaPrompter(d_prompt_style),
tokenizer, tokenizer,
cfg.train_on_inputs, cfg.train_on_inputs,
cfg.sequence_len, cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "sharegpt": elif d_base_type == "sharegpt":
ds_strategy = ShareGPTPromptTokenizingStrategy( ds_strategy = ShareGPTPromptTokenizingStrategy(
ShareGPTPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len ShareGPTPrompter(d_prompt_style),
tokenizer,
cfg.train_on_inputs,
cfg.sequence_len,
) )
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"]) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
elif d.type == "completion": elif d_base_type == "completion":
ds_strategy = CompletionPromptTokenizingStrategy( ds_strategy = CompletionPromptTokenizingStrategy(
CompletionPrompter(), CompletionPrompter(),
tokenizer, tokenizer,
@@ -146,11 +223,20 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
f"Saving merged prepared dataset to disk... {prepared_ds_path}" f"Saving merged prepared dataset to disk... {prepared_ds_path}"
) )
dataset.save_to_disk(prepared_ds_path) dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
logging.info(
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
)
return dataset return dataset
def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path): def load_prepare_datasets(
tokenizer: PreTrainedTokenizerBase, cfg, default_dataset_prepared_path
) -> (Dataset, Dataset):
max_packed_sequence_len = ( max_packed_sequence_len = (
cfg.max_packed_sequence_len if cfg.max_packed_sequence_len else cfg.sequence_len cfg.max_packed_sequence_len if cfg.max_packed_sequence_len else cfg.sequence_len
) )
@@ -158,16 +244,20 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
max_packed_sequence_len, cfg.sequence_len max_packed_sequence_len, cfg.sequence_len
) # make sure we don't accidentally set it larger than sequence_len ) # make sure we don't accidentally set it larger than sequence_len
tokenizer_name = tokenizer.__class__.__name__
if cfg.max_packed_sequence_len is not None: if cfg.max_packed_sequence_len is not None:
# see if we can go ahead and load the stacked dataset # see if we can go ahead and load the stacked dataset
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
ds_hash = str( ds_hash = str(
md5( md5(
( (
str(cfg.sequence_len) str(cfg.sequence_len)
+ "@" + "@"
+ str(max_packed_sequence_len) + str(max_packed_sequence_len)
+ seed
+ "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets])) + "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets]))
+ "|"
+ tokenizer_name
).encode("utf-8") ).encode("utf-8")
).hexdigest() ).hexdigest()
) )
@@ -177,17 +267,42 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
else Path(default_dataset_prepared_path) / ds_hash else Path(default_dataset_prepared_path) / ds_hash
) )
if any(prepared_ds_path.glob("*")): dataset = None
try:
if cfg.push_dataset_to_hub:
logging.info(
f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset = load_dataset(
f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True
)
dataset = dataset["train"]
except:
pass
if dataset:
...
elif any(prepared_ds_path.glob("*")):
logging.info( logging.info(
f"Loading prepared packed dataset from disk at {prepared_ds_path}..." f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
) )
dataset = load_from_disk(str(prepared_ds_path)) dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared packed dataset loaded from disk...") logging.info("Prepared packed dataset loaded from disk...")
if cfg.push_dataset_to_hub:
logging.info(
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
)
else: else:
dataset = load_tokenized_prepared_datasets( dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path tokenizer, cfg, default_dataset_prepared_path
) )
if cfg.seed:
dataset = dataset.shuffle(seed=cfg.seed)
constant_len_dataset = ConstantLengthDataset( constant_len_dataset = ConstantLengthDataset(
tokenizer, tokenizer,
[dataset], [dataset],
@@ -204,9 +319,9 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
d d
for d in dataset for d in dataset
if len(d["input_ids"]) < cfg.sequence_len if len(d["input_ids"]) < cfg.sequence_len
and len(d["input_ids"]) > 0 and len(d["input_ids"]) > 0
and len(d["input_ids"]) == len(d["attention_mask"]) and len(d["input_ids"]) == len(d["attention_mask"])
and len(d["input_ids"]) == len(d["labels"]) and len(d["input_ids"]) == len(d["labels"])
] ]
) )
@@ -215,6 +330,13 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
f"Saving packed prepared dataset to disk... {prepared_ds_path}" f"Saving packed prepared dataset to disk... {prepared_ds_path}"
) )
dataset.save_to_disk(prepared_ds_path) dataset.save_to_disk(prepared_ds_path)
if cfg.push_dataset_to_hub:
logging.info(
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset.push_to_hub(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
)
else: else:
dataset = load_tokenized_prepared_datasets( dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path tokenizer, cfg, default_dataset_prepared_path

View File

@@ -1,15 +1,18 @@
import logging import logging
import math
import os import os
from pathlib import Path from pathlib import Path
from typing import Optional, Tuple, TYPE_CHECKING from typing import Optional, Tuple, TYPE_CHECKING
import torch import torch
import transformers import transformers
from torch import nn
from transformers import ( from transformers import (
AutoModelForCausalLM, AutoModelForCausalLM,
AutoTokenizer, AutoTokenizer,
PreTrainedModel, PreTrainedModel,
AutoConfig, AutoConfig,
BitsAndBytesConfig,
) )
try: try:
@@ -80,6 +83,16 @@ def load_model(
logging.exception(e) logging.exception(e)
raise e raise e
model_kwargs = {}
if cfg.adapter == "qlora":
model_kwargs["quantization_config"] = BitsAndBytesConfig(
load_in_4bit=True,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=False,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
try: try:
if cfg.load_4bit and is_llama_derived_model: if cfg.load_4bit and is_llama_derived_model:
from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram from alpaca_lora_4bit.autograd_4bit import load_llama_model_4bit_low_ram
@@ -123,16 +136,46 @@ def load_model(
model = LlamaForCausalLM.from_pretrained( model = LlamaForCausalLM.from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
device_map=cfg.device_map, device_map=cfg.device_map,
**model_kwargs,
) )
# elif model_type == "GPTNeoXForCausalLM" and cfg.flash_attention:
# This is a WIP, still an issue with the backward pass
# RuntimeError: grad can be implicitly created only for scalar outputs
# TODO: try config.sequence_parallel = False
# # https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/tests/models/test_gpt_neox.py#L12
# # https://github.com/HazyResearch/flash-attention/tree/main/training#model-components
# # add `**kwargs` to https://github.com/HazyResearch/flash-attention/blob/40a25c8ee7465cf547b929cfa2937034e37bfce9/flash_attn/models/gpt.py#L442
# from flash_attn.utils.pretrained import state_dict_from_pretrained
# from flash_attn.models.gpt import GPTLMHeadModel
# from flash_attn.models.gpt_neox import remap_state_dict_hf_gpt_neox, gpt_neox_config_to_gpt2_config
# from transformers import GPTNeoXConfig
# config = gpt_neox_config_to_gpt2_config(GPTNeoXConfig.from_pretrained(base_model))
# config.use_flash_attn = True
# config.fused_bias_fc = True
# config.fused_mlp = True # GPT-NeoX-20B uses "gelu_fast"
# config.activation_function = "gelu_fast"
# config.fused_dropout_add_ln = True
# # config.residual_in_fp32 = True
#
# model: GPTLMHeadModel = GPTLMHeadModel.from_pretrained(
# base_model,
# config,
# dtype=torch_dtype,
# device=cfg.device,
# )
# model.train() # sets to train instead of eval mode
elif model_type: elif model_type:
model = getattr(transformers, model_type).from_pretrained( model = getattr(transformers, model_type).from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
device_map=cfg.device_map, device_map=cfg.device_map,
trust_remote_code=True if cfg.trust_remote_code is True else False, trust_remote_code=True if cfg.trust_remote_code is True else False,
**model_kwargs,
) )
else: else:
config = AutoConfig.from_pretrained( config = AutoConfig.from_pretrained(
@@ -143,9 +186,11 @@ def load_model(
base_model, base_model,
config=config, config=config,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
device_map=cfg.device_map, device_map=cfg.device_map,
trust_remote_code=True if cfg.trust_remote_code is True else False, trust_remote_code=True if cfg.trust_remote_code is True else False,
**model_kwargs,
) )
except Exception as e: except Exception as e:
logging.error( logging.error(
@@ -158,16 +203,26 @@ def load_model(
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
device_map=cfg.device_map, device_map=cfg.device_map,
trust_remote_code=True if cfg.trust_remote_code is True else False, trust_remote_code=True if cfg.trust_remote_code is True else False,
**model_kwargs,
) )
if not tokenizer: if not tokenizer:
try: try:
if is_llama_derived_model and "LlamaTokenizer" in globals(): if is_llama_derived_model and "LlamaTokenizer" in globals():
tokenizer = LlamaTokenizer.from_pretrained(model) tokenizer = LlamaTokenizer.from_pretrained(
model,
trust_remote_code=True if cfg.trust_remote_code is True else False,
)
else: else:
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model) tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
model,
trust_remote_code=True if cfg.trust_remote_code is True else False,
)
except: except:
tokenizer = AutoTokenizer.from_pretrained(base_model_config, trust_remote_code=True if cfg.trust_remote_code is True else False) tokenizer = AutoTokenizer.from_pretrained(
base_model_config,
trust_remote_code=True if cfg.trust_remote_code is True else False,
)
logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}") logging.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}") logging.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
@@ -181,14 +236,18 @@ def load_model(
tokenizer.add_special_tokens({"pad_token": "[PAD]"}) tokenizer.add_special_tokens({"pad_token": "[PAD]"})
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
if cfg.tokens: if cfg.special_tokens:
for k, v in cfg.tokens.items(): for k, v in cfg.special_tokens.items():
tokenizer.add_special_tokens({k: v}) tokenizer.add_special_tokens({k: v})
if cfg.tokens:
tokenizer.add_tokens(list(cfg.tokens))
# this should only be needed if you are messing with new tokens in the vocab embeddings_len = math.ceil(len(tokenizer) / 32) * 32
# model.resize_token_embeddings(len(tokenizer)) model.resize_token_embeddings(embeddings_len)
if cfg.adapter and load_in_8bit and not cfg.load_4bit: if (
(cfg.adapter == "lora" and load_in_8bit) or cfg.adapter == "qlora"
) and not cfg.load_4bit:
logging.info("converting PEFT model w/ prepare_model_for_int8_training") logging.info("converting PEFT model w/ prepare_model_for_int8_training")
model = prepare_model_for_int8_training(model) model = prepare_model_for_int8_training(model)
@@ -209,7 +268,11 @@ def load_model(
m.scales = m.scales.half() m.scales = m.scales.half()
m.bias = m.bias.half() m.bias = m.bias.half()
if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) > 1 and cfg.load_4bit: if (
torch.cuda.device_count() > 1
and int(os.getenv("WORLD_SIZE", "1")) > 1
and cfg.load_4bit
):
# llama is PROBABLY model parallelizable, but the default isn't that it is # llama is PROBABLY model parallelizable, but the default isn't that it is
# so let's only set it for the 4bit, see # so let's only set it for the 4bit, see
# https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133 # https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133
@@ -222,6 +285,7 @@ def load_model(
requires_grad.append(f"{name}: {param.requires_grad}") requires_grad.append(f"{name}: {param.requires_grad}")
if len(requires_grad) == 0: if len(requires_grad) == 0:
logging.warning("there are no parameters that require gradient updates") logging.warning("there are no parameters that require gradient updates")
model.config.use_cache = False
# TODO resume_from_checkpoint handling # TODO resume_from_checkpoint handling
return model, tokenizer, lora_config return model, tokenizer, lora_config
@@ -232,7 +296,7 @@ def load_adapter(model, cfg, adapter):
if adapter is None: if adapter is None:
return model, None return model, None
if adapter == "lora": if adapter == "lora" or adapter == "qlora":
return load_lora(model, cfg) return load_lora(model, cfg)
if adapter == "llama-adapter": if adapter == "llama-adapter":
return load_llama_adapter(model, cfg) return load_llama_adapter(model, cfg)
@@ -254,7 +318,8 @@ def load_llama_adapter(model, cfg):
task_type="CAUSAL_LM", task_type="CAUSAL_LM",
) )
if cfg.peft_model_dir: if cfg.lora_model_dir:
logging.info("Loading pretained LORA")
model = PeftModel.from_pretrained( model = PeftModel.from_pretrained(
model, model,
cfg.lora_model_dir, cfg.lora_model_dir,
@@ -296,7 +361,7 @@ def load_lora(model, cfg):
model, model,
cfg.lora_model_dir, cfg.lora_model_dir,
device_map=cfg.device_map, device_map=cfg.device_map,
torch_dtype=torch.float16, # torch_dtype=torch.float16,
) )
else: else:
model = get_peft_model(model, lora_config) model = get_peft_model(model, lora_config)

View File

@@ -9,13 +9,33 @@ import torch.cuda
import transformers import transformers
from torch import nn from torch import nn
from torch.optim.lr_scheduler import OneCycleLR from torch.optim.lr_scheduler import OneCycleLR
from transformers import EarlyStoppingCallback from transformers import EarlyStoppingCallback, Trainer
from transformers.trainer_pt_utils import get_parameter_names from transformers.trainer_pt_utils import get_parameter_names
from axolotl.utils.schedulers import InterpolatingLogScheduler from axolotl.utils.schedulers import InterpolatingLogScheduler
from axolotl.utils.callbacks import SavePeftModelCallback from axolotl.utils.callbacks import SavePeftModelCallback
class OneCycleLRSchedulerTrainer(Trainer):
    """Trainer variant whose learning-rate schedule is a ``OneCycleLR``.

    Only ``create_scheduler`` is overridden; all other behavior is inherited
    from ``Trainer``.
    """

    def create_scheduler(
        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
    ):
        """Create a ``OneCycleLR`` scheduler, store it on ``self.lr_scheduler`` and return it.

        Args:
            num_training_steps: total number of scheduler steps; must be positive.
            optimizer: optimizer to schedule. When ``None``, ``self.optimizer``
                is used instead.

        Returns:
            The newly created ``OneCycleLR`` instance (also assigned to
            ``self.lr_scheduler``).

        Raises:
            ValueError: if ``num_training_steps`` is not positive.
        """
        # Fail with a clear message instead of a ZeroDivisionError when
        # computing the warmup fraction below.
        if num_training_steps <= 0:
            raise ValueError(
                f"num_training_steps must be a positive integer, got {num_training_steps}"
            )

        optimizer = self.optimizer if optimizer is None else optimizer
        num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
        # OneCycleLR takes the warmup length as a fraction of the whole run,
        # not as an absolute step count.
        pct_start = num_warmup_steps / num_training_steps

        self.lr_scheduler = OneCycleLR(
            optimizer,
            max_lr=self.args.learning_rate,
            total_steps=num_training_steps,
            pct_start=pct_start,
            div_factor=6,
        )

        return self.lr_scheduler
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer): def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
total_num_steps = int( total_num_steps = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
@@ -38,6 +58,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
training_arguments_kwargs["bf16_full_eval"] = True training_arguments_kwargs["bf16_full_eval"] = True
else: else:
training_arguments_kwargs["bf16"] = cfg.bf16 training_arguments_kwargs["bf16"] = cfg.bf16
training_arguments_kwargs["fp16"] = True if cfg.fp16 and not cfg.bf16 else False
training_arguments_kwargs["tf32"] = cfg.tf32 training_arguments_kwargs["tf32"] = cfg.tf32
training_arguments_kwargs["warmup_steps"] = warmup_steps training_arguments_kwargs["warmup_steps"] = warmup_steps
training_arguments_kwargs["logging_steps"] = logging_steps training_arguments_kwargs["logging_steps"] = logging_steps
@@ -119,6 +140,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
cfg.optimizer == "adamw_bnb_8bit" cfg.optimizer == "adamw_bnb_8bit"
and not cfg.load_4bit and not cfg.load_4bit
and not "deepspeed" in training_arguments_kwargs and not "deepspeed" in training_arguments_kwargs
and not cfg.fsdp
): ):
decay_parameters = get_parameter_names(model, [nn.LayerNorm]) decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name] decay_parameters = [name for name in decay_parameters if "bias" not in name]
@@ -157,7 +179,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
cfg.learning_rate, cfg.learning_rate,
total_steps=total_num_steps, total_steps=total_num_steps,
epochs=cfg.num_epochs, epochs=cfg.num_epochs,
div_factor=10, div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
**lr_scheduler_kwargs, **lr_scheduler_kwargs,
) )
elif cfg.lr_scheduler == "log_sweep": elif cfg.lr_scheduler == "log_sweep":
@@ -183,7 +205,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
) )
callbacks.append(early_stop_cb) callbacks.append(early_stop_cb)
if cfg.local_rank == 0 and cfg.adapter == 'lora': # only save in rank 0 if cfg.local_rank == 0 and cfg.adapter == "lora": # only save in rank 0
callbacks.append(SavePeftModelCallback) callbacks.append(SavePeftModelCallback)
data_collator_kwargs = { data_collator_kwargs = {
@@ -194,7 +216,12 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
else: else:
data_collator_kwargs["pad_to_multiple_of"] = 8 data_collator_kwargs["pad_to_multiple_of"] = 8
trainer = transformers.Trainer( trainer_cls = (
OneCycleLRSchedulerTrainer
if cfg.lr_scheduler == "one_cycle" and cfg.fsdp
else transformers.Trainer
)
trainer = trainer_cls(
model=model, model=model,
train_dataset=train_dataset, train_dataset=train_dataset,
eval_dataset=eval_dataset, eval_dataset=eval_dataset,