Compare commits

..

3 Commits

Author SHA1 Message Date
Wing Lian
f6721baf10 tweak to make it work when we have no explicit test split
Some checks failed
pre-commit / pre-commit (push) Has been cancelled
PyTest / test (3.10) (push) Has been cancelled
PyTest / test (3.9) (push) Has been cancelled
2023-07-11 22:40:21 -04:00
Wing Lian
33814cc94e make sure we eval for openorca 2023-07-02 17:59:10 -04:00
Wing Lian
50254a7ccc handle orca splits 2023-07-01 07:20:23 -04:00
8 changed files with 80 additions and 206 deletions

View File

@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
#### How to add custom prompts #### How to add custom prompts
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`. 2. Use your custom file name as the dataset type.
Optionally, download some datasets, see [data/README.md](data/README.md) Optionally, download some datasets, see [data/README.md](data/README.md)
@@ -255,18 +255,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
- dataset - dataset
```yaml ```yaml
sequence_len: 2048 # max token length for prompt
# huggingface repo
datasets: datasets:
- path: vicgalle/alpaca-gpt4 - path: vicgalle/alpaca-gpt4 # local or huggingface repo
type: alpaca # format from earlier
# local
datasets:
- path: json
data_files: data.jsonl # or json
type: alpaca # format from earlier type: alpaca # format from earlier
sequence_len: 2048 # max token length / prompt
``` ```
- loading - loading
@@ -336,10 +328,10 @@ tf32: true # require >=ampere
# a list of one or more datasets to finetune the model with # a list of one or more datasets to finetune the model with
datasets: datasets:
# hf dataset repo | "json" for local dataset, make sure to fill data_files # this can be either a hf dataset, or relative path
- path: vicgalle/alpaca-gpt4 - path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn> type: alpaca # format OR format:prompt_style (chat/instruct)
data_files: # path to source data files data_files: # path to source data files
shards: # number of shards to split data into shards: # number of shards to split data into
@@ -349,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared
# push prepared dataset to hub # push prepared dataset to hub
push_dataset_to_hub: # repo path push_dataset_to_hub: # repo path
# push checkpoints to hub # push checkpoints to hub
hub_model_id: # repo path push_to_hub_model_id: # repo path
# whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets # whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# required to be true when used in combination with `push_dataset_to_hub` # required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: # boolean hf_use_auth_token: # boolean

View File

@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
RUN git lfs install --skip-repo RUN git lfs install --skip-repo
RUN pip3 install awscli && \ RUN pip3 install awscli && \
# The base image ships with `pydantic==1.8.2` which is not working # The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10 pip3 install -U --no-cache-dir pydantic

View File

@@ -37,7 +37,7 @@ from axolotl.prompters import (
def load_tokenized_prepared_datasets( def load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path split, tokenizer, cfg, default_dataset_prepared_path
) -> DatasetDict: ) -> DatasetDict:
tokenizer_name = tokenizer.__class__.__name__ tokenizer_name = tokenizer.__class__.__name__
ds_hash = str( ds_hash = str(
@@ -49,6 +49,8 @@ def load_tokenized_prepared_datasets(
sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets]) sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
) )
+ "|" + "|"
+ split
+ "|"
+ tokenizer_name + tokenizer_name
).encode("utf-8") ).encode("utf-8")
).hexdigest() ).hexdigest()
@@ -66,7 +68,7 @@ def load_tokenized_prepared_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}", f"{cfg.push_dataset_to_hub}/{ds_hash}",
use_auth_token=use_auth_token, use_auth_token=use_auth_token,
) )
dataset = dataset["train"] dataset = dataset[split]
except Exception: # pylint: disable=broad-except # nosec except Exception: # pylint: disable=broad-except # nosec
pass pass
@@ -102,26 +104,13 @@ def load_tokenized_prepared_datasets(
pass pass
# prefer local dataset, even if hub exists # prefer local dataset, even if hub exists
local_path = Path(d.path) if Path(d.path).exists():
if local_path.exists(): ds = load_dataset(
if local_path.is_dir(): "json",
ds = load_dataset( data_files=d.path,
d.path, streaming=False,
data_files=d.data_files, split=None,
streaming=False, )
split=None,
)
elif local_path.is_file():
ds = load_dataset(
"json",
data_files=d.path,
streaming=False,
split=None,
)
else:
raise ValueError(
"unhandled dataset load: local path exists, but is neither a directory or a file"
)
elif ds_from_hub: elif ds_from_hub:
if d.data_files: if d.data_files:
ds = load_dataset( ds = load_dataset(
@@ -147,8 +136,8 @@ def load_tokenized_prepared_datasets(
raise ValueError("unhandled dataset load") raise ValueError("unhandled dataset load")
# support for using a subset of the data # support for using a subset of the data
if d.shards: if d.shards:
if "train" in ds: if split in ds:
ds = ds.shuffle(seed=seed)["train"].shard( ds = ds.shuffle(seed=seed)[split].shard(
num_shards=d.shards, index=0 num_shards=d.shards, index=0
) )
else: else:
@@ -157,8 +146,8 @@ def load_tokenized_prepared_datasets(
d_type_split = d_type.split(":") d_type_split = d_type.split(":")
d_base_type = d_type_split[0] d_base_type = d_type_split[0]
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
if "train" in ds: if split in ds:
ds = ds["train"] ds = ds[split]
if ds_strategy := load(d.type, tokenizer, cfg): if ds_strategy := load(d.type, tokenizer, cfg):
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds) ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
@@ -332,7 +321,6 @@ def load_prepare_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}", f"{cfg.push_dataset_to_hub}/{ds_hash}",
use_auth_token=use_auth_token, use_auth_token=use_auth_token,
) )
dataset = dataset["train"]
except Exception: # pylint: disable=broad-except # nosec except Exception: # pylint: disable=broad-except # nosec
pass pass
@@ -352,28 +340,37 @@ def load_prepare_datasets(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
) )
else: else:
dataset = load_tokenized_prepared_datasets( dataset_train = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path "train", tokenizer, cfg, default_dataset_prepared_path
) )
dataset_test = load_tokenized_prepared_datasets(
"test", tokenizer, cfg, default_dataset_prepared_path
)
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.seed: if cfg.seed:
dataset = dataset.shuffle(seed=cfg.seed) dataset = dataset.shuffle(seed=cfg.seed)
constant_len_dataset = ConstantLengthDataset( constant_len_dataset_train = ConstantLengthDataset(
tokenizer, tokenizer,
[dataset], [dataset["train"]],
seq_length=max_packed_sequence_len,
)
constant_len_dataset_test = ConstantLengthDataset(
tokenizer,
[dataset["test"]],
seq_length=max_packed_sequence_len, seq_length=max_packed_sequence_len,
) )
logging.info( logging.info(
f"packing master dataset to len: {cfg.max_packed_sequence_len}" f"packing master dataset to len: {cfg.max_packed_sequence_len}"
) )
dataset = Dataset.from_list(list(constant_len_dataset)) dataset_train = Dataset.from_list(list(constant_len_dataset_train))
dataset_test = Dataset.from_list(list(constant_len_dataset_test))
# filter out bad data # filter out bad data
dataset = Dataset.from_list( dataset_train = Dataset.from_list(
[ [
d d
for d in dataset for d in dataset_train
if len(d["input_ids"]) < cfg.sequence_len if len(d["input_ids"]) < cfg.sequence_len
and len(d["input_ids"]) > 0 and len(d["input_ids"]) > 0
and len(d["input_ids"]) == len(d["attention_mask"]) and len(d["input_ids"]) == len(d["attention_mask"])
@@ -381,6 +378,19 @@ def load_prepare_datasets(
] ]
) )
# filter out bad data
dataset_test = Dataset.from_list(
[
d
for d in dataset_test
if len(d["input_ids"]) < cfg.sequence_len
and len(d["input_ids"]) > 0
and len(d["input_ids"]) == len(d["attention_mask"])
and len(d["input_ids"]) == len(d["labels"])
]
)
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.local_rank == 0: if cfg.local_rank == 0:
logging.info( logging.info(
f"Saving packed prepared dataset to disk... {prepared_ds_path}" f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -395,9 +405,14 @@ def load_prepare_datasets(
private=True, private=True,
) )
else: else:
# dataset_train = load_tokenized_prepared_datasets(
dataset = load_tokenized_prepared_datasets( dataset = load_tokenized_prepared_datasets(
tokenizer, cfg, default_dataset_prepared_path "train", tokenizer, cfg, default_dataset_prepared_path
) )
# dataset_test = load_tokenized_prepared_datasets(
# "test", tokenizer, cfg, default_dataset_prepared_path
# )
# dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None: if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
logging.info( logging.info(
@@ -412,6 +427,9 @@ def load_prepare_datasets(
dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False) dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
train_dataset = dataset["train"] train_dataset = dataset["train"]
eval_dataset = dataset["test"] eval_dataset = dataset["test"]
elif "train" in dataset:
train_dataset = dataset["train"]
eval_dataset = dataset["test"]
else: else:
train_dataset = dataset train_dataset = dataset
eval_dataset = None eval_dataset = None

View File

@@ -202,7 +202,7 @@ def load_model(
else True, else True,
) )
load_in_8bit = False load_in_8bit = False
elif cfg.is_llama_derived_model and not cfg.trust_remote_code: elif cfg.is_llama_derived_model:
from transformers import LlamaForCausalLM from transformers import LlamaForCausalLM
config = LlamaConfig.from_pretrained(base_model_config) config = LlamaConfig.from_pretrained(base_model_config)
@@ -241,7 +241,7 @@ def load_model(
# device=cfg.device, # device=cfg.device,
# ) # )
# model.train() # sets to train instead of eval mode # model.train() # sets to train instead of eval mode
elif model_type and not cfg.trust_remote_code: elif model_type:
model = getattr(transformers, model_type).from_pretrained( model = getattr(transformers, model_type).from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,

View File

@@ -1,9 +1,6 @@
"""Module for custom LRScheduler class""" """Module for custom LRScheduler class"""
import math
from functools import partial
from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
class InterpolatingLogScheduler(LRScheduler): class InterpolatingLogScheduler(LRScheduler):
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
lrs = [self.max_lr for base_lr in self.base_lrs] lrs = [self.max_lr for base_lr in self.base_lrs]
return lrs return lrs
def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
current_step: int,
*,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float
):
if current_step < num_warmup_steps:
return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
progress = float(current_step - num_warmup_steps) / float(
max(1, num_training_steps - num_warmup_steps)
)
return max(
0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
)
def get_cosine_schedule_with_quadratic_warmup(
optimizer: Optimizer,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float = 0.5,
last_epoch: int = -1,
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
num_cycles (`float`, *optional*, defaults to 0.5):
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
following a half-cosine).
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
lr_lambda = partial(
_get_cosine_schedule_with_quadratic_warmup_lr_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)

View File

@@ -5,82 +5,25 @@ import logging
import math import math
import os import os
import sys import sys
from dataclasses import field
from pathlib import Path from pathlib import Path
from typing import Any, Dict, Optional from typing import Optional
import bitsandbytes as bnb import bitsandbytes as bnb
import torch.cuda import torch.cuda
import torch.nn.functional as F
import transformers import transformers
from torch import nn from torch import nn
from torch.optim.lr_scheduler import OneCycleLR from torch.optim.lr_scheduler import OneCycleLR
from transformers import ( from transformers import EarlyStoppingCallback, Trainer
EarlyStoppingCallback,
EvalPrediction,
Trainer,
TrainingArguments,
)
from transformers.trainer_pt_utils import get_parameter_names from transformers.trainer_pt_utils import get_parameter_names
from axolotl.utils.callbacks import ( from axolotl.utils.callbacks import (
SaveBetterTransformerModelCallback, SaveBetterTransformerModelCallback,
SavePeftModelCallback, SavePeftModelCallback,
) )
from axolotl.utils.schedulers import ( from axolotl.utils.schedulers import InterpolatingLogScheduler
InterpolatingLogScheduler,
get_cosine_schedule_with_quadratic_warmup,
)
class AxolotlTrainingArguments(TrainingArguments): class OneCycleLRSchedulerTrainer(Trainer):
"""
Extend the base TrainingArguments for axolotl helpers
"""
lr_quadratic_warmup: bool = field(
default=False,
metadata={"help": "Use quadratic warmup for cosine scheduling."},
)
class AxolotlTrainer(Trainer):
"""
Extend the base Trainer for axolotl helpers
"""
args = None # type: AxolotlTrainingArguments
def create_scheduler(
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
):
"""
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
passed as an argument.
Args:
num_training_steps (int): The number of training steps to do.
optimizer (torch.optim.Optimizer): The training optimizer
"""
# fmt: off
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
# fmt: on
if (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
):
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
else:
return super().create_scheduler(num_training_steps, optimizer)
return self.lr_scheduler
class OneCycleLRSchedulerTrainer(AxolotlTrainer):
""" """
Trainer subclass that uses the OneCycleLR scheduler Trainer subclass that uses the OneCycleLR scheduler
""" """
@@ -160,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
if cfg.fsdp_config: if cfg.fsdp_config:
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config) training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
if cfg.lr_quadratic_warmup is not None:
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
# deepspeed # deepspeed
if ( if (
os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
@@ -184,11 +124,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
if cfg.max_grad_norm: if cfg.max_grad_norm:
training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
if cfg.hub_model_id: if cfg.push_to_hub_model_id:
training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id
training_arguments_kwargs["push_to_hub"] = True training_arguments_kwargs["push_to_hub"] = True
training_args = AxolotlTrainingArguments( training_args = transformers.TrainingArguments(
per_device_train_batch_size=cfg.micro_batch_size, per_device_train_batch_size=cfg.micro_batch_size,
per_device_eval_batch_size=cfg.eval_batch_size per_device_eval_batch_size=cfg.eval_batch_size
if cfg.eval_batch_size is not None if cfg.eval_batch_size is not None
@@ -197,9 +137,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
eval_accumulation_steps=cfg.gradient_accumulation_steps, eval_accumulation_steps=cfg.gradient_accumulation_steps,
num_train_epochs=cfg.num_epochs, num_train_epochs=cfg.num_epochs,
learning_rate=cfg.learning_rate, learning_rate=cfg.learning_rate,
evaluation_strategy="steps" if cfg.val_set_size > 0 else "no", evaluation_strategy="steps",
save_strategy="steps" if cfg.save_steps else "epoch", save_strategy="steps" if cfg.save_steps else "epoch",
eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None, eval_steps=cfg.eval_steps,
save_steps=cfg.save_steps, save_steps=cfg.save_steps,
output_dir=cfg.output_dir, output_dir=cfg.output_dir,
save_total_limit=3, save_total_limit=3,
@@ -335,23 +275,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
num_proc=32, num_proc=32,
) )
if cfg.compute_perplexity_metrics:
def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
logits = eval_preds.predictions
labels = eval_preds.label_ids
cross_entropy_loss = F.cross_entropy(
logits.view(-1, model.config.vocab_size), labels.view(-1)
)
perplexity = torch.exp(cross_entropy_loss)
return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
trainer_kwargs["compute_metrics"] = compute_metrics
trainer_cls = ( trainer_cls = (
OneCycleLRSchedulerTrainer OneCycleLRSchedulerTrainer
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora") if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
else AxolotlTrainer else transformers.Trainer
) )
trainer = trainer_cls( trainer = trainer_cls(
model=model, model=model,

View File

@@ -87,16 +87,11 @@ def validate_config(cfg):
"You probably want to disable group_by_length as it will force a streamed dataset to download completely." "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
) )
if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and ( if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
not cfg.optimizer or "adamw" not in cfg.optimizer not cfg.optimizer or "adamw" not in cfg.optimizer
): ):
logging.warning("adamw hyperparameters found, but no adamw optimizer set") logging.warning("adamw hyperparameters found, but no adamw optimizer set")
if cfg.push_to_hub_model_id:
raise ValueError(
"push_to_hub_model_id is deprecated. Please use hub_model_id instead."
)
# TODO # TODO
# MPT 7b # MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25 # https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault( cfg = DictDefault(
{ {
"optimizer": None, "optimizer": None,
"adam_epsilon": 0.0001, "adamw_epsilon": 0.0001,
} }
) )
@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault( cfg = DictDefault(
{ {
"optimizer": "adafactor", "optimizer": "adafactor",
"adam_beta1": 0.0001, "adamw_beta1": 0.0001,
} }
) )
@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
cfg = DictDefault( cfg = DictDefault(
{ {
"optimizer": "adamw_bnb_8bit", "optimizer": "adamw_bnb_8bit",
"adam_beta1": 0.9, "adamw_beta1": 0.0001,
"adam_beta2": 0.99, "adamw_beta2": 0.0001,
"adam_epsilon": 0.0001, "adamw_epsilon": 0.0001,
} }
) )