Compare commits

..

3 Commits

Author SHA1 Message Date
Wing Lian
8028652b8f fix attetion mask with packing
Some checks failed
pre-commit / pre-commit (push) Has been cancelled
PyTest / test (3.10) (push) Has been cancelled
PyTest / test (3.9) (push) Has been cancelled
2023-07-15 10:38:01 -04:00
Wing Lian
33814cc94e make sure we eval for openorca 2023-07-02 17:59:10 -04:00
Wing Lian
50254a7ccc handle orca splits 2023-07-01 07:20:23 -04:00
10 changed files with 86 additions and 215 deletions

View File

@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
#### How to add custom prompts
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
2. Use your custom file name as the dataset type.
Optionally, download some datasets, see [data/README.md](data/README.md)
@@ -255,18 +255,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
- dataset
```yaml
sequence_len: 2048 # max token length for prompt
# huggingface repo
datasets:
- path: vicgalle/alpaca-gpt4
type: alpaca # format from earlier
# local
datasets:
- path: json
data_files: data.jsonl # or json
- path: vicgalle/alpaca-gpt4 # local or huggingface repo
type: alpaca # format from earlier
sequence_len: 2048 # max token length / prompt
```
- loading
@@ -336,10 +328,10 @@ tf32: true # require >=ampere
# a list of one or more datasets to finetune the model with
datasets:
# hf dataset repo | "json" for local dataset, make sure to fill data_files
# this can be either a hf dataset, or relative path
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
type: alpaca # format OR format:prompt_style (chat/instruct)
data_files: # path to source data files
shards: # number of shards to split data into
@@ -349,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared
# push prepared dataset to hub
push_dataset_to_hub: # repo path
# push checkpoints to hub
hub_model_id: # repo path
push_to_hub_model_id: # repo path
# whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: # boolean

View File

@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
RUN git lfs install --skip-repo
RUN pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10
pip3 install -U --no-cache-dir pydantic

View File

@@ -79,11 +79,13 @@ class ConstantLengthDataset(IterableDataset):
buffer = {"input_ids": [], "attention_mask": [], "labels": []}
buffer_len = 0
for dataset in self.datasets:
idx = 0
iterator = iter(dataset)
more_examples = True
while more_examples:
try:
example = next(iterator)
idx += 1
except StopIteration:
more_examples = False
example = None
@@ -124,6 +126,7 @@ class ConstantLengthDataset(IterableDataset):
"labels": [],
}
buffer_len = 0
idx = 1
if example:
# FIXME
@@ -132,11 +135,6 @@ class ConstantLengthDataset(IterableDataset):
input_ids = example["input_ids"]
attention_mask = example["attention_mask"]
labels = example["labels"]
if (
buffer["input_ids"]
and input_ids[0] == self.tokenizer.bos_token_id
):
attention_mask[0] = 0
if add_concat_token:
input_ids.append(self.concat_token_id)
@@ -147,7 +145,7 @@ class ConstantLengthDataset(IterableDataset):
input_ids, dtype=self.tokens_dtype
)
attention_mask_with_concat = torch.tensor(
attention_mask, dtype=self.tokens_dtype
[idx * m for m in attention_mask], dtype=torch.int16
)
labels_with_concat = torch.tensor(
labels, dtype=self.tokens_dtype

View File

@@ -37,7 +37,7 @@ from axolotl.prompters import (
def load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path
split, tokenizer, cfg, default_dataset_prepared_path
) -> DatasetDict:
tokenizer_name = tokenizer.__class__.__name__
ds_hash = str(
@@ -49,6 +49,8 @@ def load_tokenized_prepared_datasets(
sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
)
+ "|"
+ split
+ "|"
+ tokenizer_name
).encode("utf-8")
).hexdigest()
@@ -66,7 +68,7 @@ def load_tokenized_prepared_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}",
use_auth_token=use_auth_token,
)
dataset = dataset["train"]
dataset = dataset[split]
except Exception: # pylint: disable=broad-except # nosec
pass
@@ -102,26 +104,13 @@ def load_tokenized_prepared_datasets(
pass
# prefer local dataset, even if hub exists
local_path = Path(d.path)
if local_path.exists():
if local_path.is_dir():
ds = load_dataset(
d.path,
data_files=d.data_files,
streaming=False,
split=None,
)
elif local_path.is_file():
ds = load_dataset(
"json",
data_files=d.path,
streaming=False,
split=None,
)
else:
raise ValueError(
"unhandled dataset load: local path exists, but is neither a directory or a file"
)
if Path(d.path).exists():
ds = load_dataset(
"json",
data_files=d.path,
streaming=False,
split=None,
)
elif ds_from_hub:
if d.data_files:
ds = load_dataset(
@@ -147,8 +136,8 @@ def load_tokenized_prepared_datasets(
raise ValueError("unhandled dataset load")
# support for using a subset of the data
if d.shards:
if "train" in ds:
ds = ds.shuffle(seed=seed)["train"].shard(
if split in ds:
ds = ds.shuffle(seed=seed)[split].shard(
num_shards=d.shards, index=0
)
else:
@@ -157,8 +146,8 @@ def load_tokenized_prepared_datasets(
d_type_split = d_type.split(":")
d_base_type = d_type_split[0]
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
if "train" in ds:
ds = ds["train"]
if split in ds:
ds = ds[split]
if ds_strategy := load(d.type, tokenizer, cfg):
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
datasets.append(ds_wrapper)
@@ -332,7 +321,6 @@ def load_prepare_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}",
use_auth_token=use_auth_token,
)
dataset = dataset["train"]
except Exception: # pylint: disable=broad-except # nosec
pass
@@ -352,28 +340,37 @@ def load_prepare_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
)
else:
dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path
dataset_train = load_tokenized_prepared_datasets(
"train", tokenizer, cfg, default_dataset_prepared_path
)
dataset_test = load_tokenized_prepared_datasets(
"test", tokenizer, cfg, default_dataset_prepared_path
)
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.seed:
dataset = dataset.shuffle(seed=cfg.seed)
constant_len_dataset = ConstantLengthDataset(
constant_len_dataset_train = ConstantLengthDataset(
tokenizer,
[dataset],
[dataset["train"]],
seq_length=max_packed_sequence_len,
)
constant_len_dataset_test = ConstantLengthDataset(
tokenizer,
[dataset["test"]],
seq_length=max_packed_sequence_len,
)
logging.info(
f"packing master dataset to len: {cfg.max_packed_sequence_len}"
)
dataset = Dataset.from_list(list(constant_len_dataset))
dataset_train = Dataset.from_list(list(constant_len_dataset_train))
dataset_test = Dataset.from_list(list(constant_len_dataset_test))
# filter out bad data
dataset = Dataset.from_list(
dataset_train = Dataset.from_list(
[
d
for d in dataset
for d in dataset_train
if len(d["input_ids"]) < cfg.sequence_len
and len(d["input_ids"]) > 0
and len(d["input_ids"]) == len(d["attention_mask"])
@@ -381,6 +378,19 @@ def load_prepare_datasets(
]
)
# filter out bad data
dataset_test = Dataset.from_list(
[
d
for d in dataset_test
if len(d["input_ids"]) < cfg.sequence_len
and len(d["input_ids"]) > 0
and len(d["input_ids"]) == len(d["attention_mask"])
and len(d["input_ids"]) == len(d["labels"])
]
)
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.local_rank == 0:
logging.info(
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -395,9 +405,13 @@ def load_prepare_datasets(
private=True,
)
else:
dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path
dataset_train = load_tokenized_prepared_datasets(
"train", tokenizer, cfg, default_dataset_prepared_path
)
dataset_test = load_tokenized_prepared_datasets(
"test", tokenizer, cfg, default_dataset_prepared_path
)
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
logging.info(
@@ -412,6 +426,9 @@ def load_prepare_datasets(
dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
elif "train" in dataset:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
else:
train_dataset = dataset
eval_dataset = None

View File

@@ -202,7 +202,7 @@ def load_model(
else True,
)
load_in_8bit = False
elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
elif cfg.is_llama_derived_model:
from transformers import LlamaForCausalLM
config = LlamaConfig.from_pretrained(base_model_config)
@@ -241,7 +241,7 @@ def load_model(
# device=cfg.device,
# )
# model.train() # sets to train instead of eval mode
elif model_type and not cfg.trust_remote_code:
elif model_type:
model = getattr(transformers, model_type).from_pretrained(
base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,

View File

@@ -1,9 +1,6 @@
"""Module for custom LRScheduler class"""
import math
from functools import partial
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
from torch.optim.lr_scheduler import LRScheduler
class InterpolatingLogScheduler(LRScheduler):
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
lrs = [self.max_lr for base_lr in self.base_lrs]
return lrs
def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
current_step: int,
*,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float
):
if current_step < num_warmup_steps:
return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
progress = float(current_step - num_warmup_steps) / float(
max(1, num_training_steps - num_warmup_steps)
)
return max(
0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
)
def get_cosine_schedule_with_quadratic_warmup(
optimizer: Optimizer,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float = 0.5,
last_epoch: int = -1,
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
num_cycles (`float`, *optional*, defaults to 0.5):
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
following a half-cosine).
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_lambda = partial(
_get_cosine_schedule_with_quadratic_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)

View File

@@ -5,82 +5,25 @@ import logging
import math
import os
import sys
from dataclasses import field
from pathlib import Path
from typing import Any, Dict, Optional
from typing import Optional
import bitsandbytes as bnb
import torch.cuda
import torch.nn.functional as F
import transformers
from torch import nn
from torch.optim.lr_scheduler import OneCycleLR
from transformers import (
EarlyStoppingCallback,
EvalPrediction,
Trainer,
TrainingArguments,
)
from transformers import EarlyStoppingCallback, Trainer
from transformers.trainer_pt_utils import get_parameter_names
from axolotl.utils.callbacks import (
SaveBetterTransformerModelCallback,
SavePeftModelCallback,
)
from axolotl.utils.schedulers import (
InterpolatingLogScheduler,
get_cosine_schedule_with_quadratic_warmup,
)
from axolotl.utils.schedulers import InterpolatingLogScheduler
class AxolotlTrainingArguments(TrainingArguments):
"""
Extend the base TrainingArguments for axolotl helpers
"""
lr_quadratic_warmup: bool = field(
default=False,
metadata={"help": "Use quadratic warmup for cosine scheduling."},
)
class AxolotlTrainer(Trainer):
"""
Extend the base Trainer for axolotl helpers
"""
args = None # type: AxolotlTrainingArguments
def create_scheduler(
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
):
"""
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
passed as an argument.
Args:
num_training_steps (int): The number of training steps to do.
optimizer (torch.optim.Optimizer): The training optimizer
"""
# fmt: off
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
# fmt: on
if (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
):
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
else:
return super().create_scheduler(num_training_steps, optimizer)
return self.lr_scheduler
class OneCycleLRSchedulerTrainer(AxolotlTrainer):
class OneCycleLRSchedulerTrainer(Trainer):
"""
Trainer subclass that uses the OneCycleLR scheduler
"""
@@ -160,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
if cfg.fsdp_config:
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
if cfg.lr_quadratic_warmup is not None:
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
# deepspeed
if (
os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -184,11 +124,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
if cfg.max_grad_norm:
training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
if cfg.hub_model_id:
training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
if cfg.push_to_hub_model_id:
training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id
training_arguments_kwargs["push_to_hub"] = True
training_args = AxolotlTrainingArguments(
training_args = transformers.TrainingArguments(
per_device_train_batch_size=cfg.micro_batch_size,
per_device_eval_batch_size=cfg.eval_batch_size
if cfg.eval_batch_size is not None
@@ -197,9 +137,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
eval_accumulation_steps=cfg.gradient_accumulation_steps,
num_train_epochs=cfg.num_epochs,
learning_rate=cfg.learning_rate,
evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
evaluation_strategy="steps",
save_strategy="steps" if cfg.save_steps else "epoch",
eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
eval_steps=cfg.eval_steps,
save_steps=cfg.save_steps,
output_dir=cfg.output_dir,
save_total_limit=3,
@@ -335,23 +275,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
num_proc=32,
)
if cfg.compute_perplexity_metrics:
def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
logits = eval_preds.predictions
labels = eval_preds.label_ids
cross_entropy_loss = F.cross_entropy(
logits.view(-1, model.config.vocab_size), labels.view(-1)
)
perplexity = torch.exp(cross_entropy_loss)
return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
trainer_kwargs["compute_metrics"] = compute_metrics
trainer_cls = (
OneCycleLRSchedulerTrainer
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
else AxolotlTrainer
else transformers.Trainer
)
trainer = trainer_cls(
model=model,

View File

@@ -87,16 +87,11 @@ def validate_config(cfg):
"You probably want to disable group_by_length as it will force a streamed dataset to download completely."
)
if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
not cfg.optimizer or "adamw" not in cfg.optimizer
):
logging.warning("adamw hyperparameters found, but no adamw optimizer set")
if cfg.push_to_hub_model_id:
raise ValueError(
"push_to_hub_model_id is deprecated. Please use hub_model_id instead."
)
# TODO
# MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -27,7 +27,7 @@ class TestPacking(unittest.TestCase):
}
)
def test_resets_attention(self):
def test_increments_attention(self):
prompter = AlpacaPrompter("chat")
strat = AlpacaPromptTokenizingStrategy(
prompter,
@@ -58,7 +58,7 @@ class TestPacking(unittest.TestCase):
# but subsequent one does
assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
assert example["attention_mask"][next_bos_index] == 0
assert example["attention_mask"][next_bos_index] == 2
if __name__ == "__main__":

View File

@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault(
{
"optimizer": None,
"adam_epsilon": 0.0001,
"adamw_epsilon": 0.0001,
}
)
@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault(
{
"optimizer": "adafactor",
"adam_beta1": 0.0001,
"adamw_beta1": 0.0001,
}
)
@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault(
{
"optimizer": "adamw_bnb_8bit",
"adam_beta1": 0.9,
"adam_beta2": 0.99,
"adam_epsilon": 0.0001,
"adamw_beta1": 0.0001,
"adamw_beta2": 0.0001,
"adamw_epsilon": 0.0001,
}
)