Compare commits
3 Commits
compute-pe
...
openorca-f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8028652b8f | ||
|
|
33814cc94e | ||
|
|
50254a7ccc |
20
README.md
20
README.md
@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
|
||||
#### How to add custom prompts
|
||||
|
||||
1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
|
||||
2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`.
|
||||
2. Use your custom file name as the dataset type.
|
||||
|
||||
Optionally, download some datasets, see [data/README.md](data/README.md)
|
||||
|
||||
@@ -255,18 +255,10 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
||||
|
||||
- dataset
|
||||
```yaml
|
||||
sequence_len: 2048 # max token length for prompt
|
||||
|
||||
# huggingface repo
|
||||
datasets:
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
type: alpaca # format from earlier
|
||||
|
||||
# local
|
||||
datasets:
|
||||
- path: json
|
||||
data_files: data.jsonl # or json
|
||||
- path: vicgalle/alpaca-gpt4 # local or huggingface repo
|
||||
type: alpaca # format from earlier
|
||||
sequence_len: 2048 # max token length / prompt
|
||||
```
|
||||
|
||||
- loading
|
||||
@@ -336,10 +328,10 @@ tf32: true # require >=ampere
|
||||
|
||||
# a list of one or more datasets to finetune the model with
|
||||
datasets:
|
||||
# hf dataset repo | "json" for local dataset, make sure to fill data_files
|
||||
# this can be either a hf dataset, or relative path
|
||||
- path: vicgalle/alpaca-gpt4
|
||||
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||
type: alpaca # format OR format:prompt_style (chat/instruct)
|
||||
data_files: # path to source data files
|
||||
shards: # number of shards to split data into
|
||||
|
||||
@@ -349,7 +341,7 @@ dataset_prepared_path: data/last_run_prepared
|
||||
# push prepared dataset to hub
|
||||
push_dataset_to_hub: # repo path
|
||||
# push checkpoints to hub
|
||||
hub_model_id: # repo path
|
||||
push_to_hub_model_id: # repo path
|
||||
# whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
|
||||
# required to be true when used in combination with `push_dataset_to_hub`
|
||||
hf_use_auth_token: # boolean
|
||||
|
||||
@@ -97,4 +97,4 @@ RUN cd /workspace/builds/bitsandbytes && python3 setup.py install
|
||||
RUN git lfs install --skip-repo
|
||||
RUN pip3 install awscli && \
|
||||
# The base image ships with `pydantic==1.8.2` which is not working
|
||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||
pip3 install -U --no-cache-dir pydantic
|
||||
|
||||
@@ -79,11 +79,13 @@ class ConstantLengthDataset(IterableDataset):
|
||||
buffer = {"input_ids": [], "attention_mask": [], "labels": []}
|
||||
buffer_len = 0
|
||||
for dataset in self.datasets:
|
||||
idx = 0
|
||||
iterator = iter(dataset)
|
||||
more_examples = True
|
||||
while more_examples:
|
||||
try:
|
||||
example = next(iterator)
|
||||
idx += 1
|
||||
except StopIteration:
|
||||
more_examples = False
|
||||
example = None
|
||||
@@ -124,6 +126,7 @@ class ConstantLengthDataset(IterableDataset):
|
||||
"labels": [],
|
||||
}
|
||||
buffer_len = 0
|
||||
idx = 1
|
||||
|
||||
if example:
|
||||
# FIXME
|
||||
@@ -132,11 +135,6 @@ class ConstantLengthDataset(IterableDataset):
|
||||
input_ids = example["input_ids"]
|
||||
attention_mask = example["attention_mask"]
|
||||
labels = example["labels"]
|
||||
if (
|
||||
buffer["input_ids"]
|
||||
and input_ids[0] == self.tokenizer.bos_token_id
|
||||
):
|
||||
attention_mask[0] = 0
|
||||
|
||||
if add_concat_token:
|
||||
input_ids.append(self.concat_token_id)
|
||||
@@ -147,7 +145,7 @@ class ConstantLengthDataset(IterableDataset):
|
||||
input_ids, dtype=self.tokens_dtype
|
||||
)
|
||||
attention_mask_with_concat = torch.tensor(
|
||||
attention_mask, dtype=self.tokens_dtype
|
||||
[idx * m for m in attention_mask], dtype=torch.int16
|
||||
)
|
||||
labels_with_concat = torch.tensor(
|
||||
labels, dtype=self.tokens_dtype
|
||||
|
||||
@@ -37,7 +37,7 @@ from axolotl.prompters import (
|
||||
|
||||
|
||||
def load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
split, tokenizer, cfg, default_dataset_prepared_path
|
||||
) -> DatasetDict:
|
||||
tokenizer_name = tokenizer.__class__.__name__
|
||||
ds_hash = str(
|
||||
@@ -49,6 +49,8 @@ def load_tokenized_prepared_datasets(
|
||||
sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
|
||||
)
|
||||
+ "|"
|
||||
+ split
|
||||
+ "|"
|
||||
+ tokenizer_name
|
||||
).encode("utf-8")
|
||||
).hexdigest()
|
||||
@@ -66,7 +68,7 @@ def load_tokenized_prepared_datasets(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
||||
use_auth_token=use_auth_token,
|
||||
)
|
||||
dataset = dataset["train"]
|
||||
dataset = dataset[split]
|
||||
except Exception: # pylint: disable=broad-except # nosec
|
||||
pass
|
||||
|
||||
@@ -102,26 +104,13 @@ def load_tokenized_prepared_datasets(
|
||||
pass
|
||||
|
||||
# prefer local dataset, even if hub exists
|
||||
local_path = Path(d.path)
|
||||
if local_path.exists():
|
||||
if local_path.is_dir():
|
||||
ds = load_dataset(
|
||||
d.path,
|
||||
data_files=d.data_files,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
elif local_path.is_file():
|
||||
ds = load_dataset(
|
||||
"json",
|
||||
data_files=d.path,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
||||
)
|
||||
if Path(d.path).exists():
|
||||
ds = load_dataset(
|
||||
"json",
|
||||
data_files=d.path,
|
||||
streaming=False,
|
||||
split=None,
|
||||
)
|
||||
elif ds_from_hub:
|
||||
if d.data_files:
|
||||
ds = load_dataset(
|
||||
@@ -147,8 +136,8 @@ def load_tokenized_prepared_datasets(
|
||||
raise ValueError("unhandled dataset load")
|
||||
# support for using a subset of the data
|
||||
if d.shards:
|
||||
if "train" in ds:
|
||||
ds = ds.shuffle(seed=seed)["train"].shard(
|
||||
if split in ds:
|
||||
ds = ds.shuffle(seed=seed)[split].shard(
|
||||
num_shards=d.shards, index=0
|
||||
)
|
||||
else:
|
||||
@@ -157,8 +146,8 @@ def load_tokenized_prepared_datasets(
|
||||
d_type_split = d_type.split(":")
|
||||
d_base_type = d_type_split[0]
|
||||
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
|
||||
if "train" in ds:
|
||||
ds = ds["train"]
|
||||
if split in ds:
|
||||
ds = ds[split]
|
||||
if ds_strategy := load(d.type, tokenizer, cfg):
|
||||
ds_wrapper = TokenizedPromptDataset(ds_strategy, ds)
|
||||
datasets.append(ds_wrapper)
|
||||
@@ -332,7 +321,6 @@ def load_prepare_datasets(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
||||
use_auth_token=use_auth_token,
|
||||
)
|
||||
dataset = dataset["train"]
|
||||
except Exception: # pylint: disable=broad-except # nosec
|
||||
pass
|
||||
|
||||
@@ -352,28 +340,37 @@ def load_prepare_datasets(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
|
||||
)
|
||||
else:
|
||||
dataset = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
dataset_train = load_tokenized_prepared_datasets(
|
||||
"train", tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
|
||||
dataset_test = load_tokenized_prepared_datasets(
|
||||
"test", tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
|
||||
if cfg.seed:
|
||||
dataset = dataset.shuffle(seed=cfg.seed)
|
||||
|
||||
constant_len_dataset = ConstantLengthDataset(
|
||||
constant_len_dataset_train = ConstantLengthDataset(
|
||||
tokenizer,
|
||||
[dataset],
|
||||
[dataset["train"]],
|
||||
seq_length=max_packed_sequence_len,
|
||||
)
|
||||
constant_len_dataset_test = ConstantLengthDataset(
|
||||
tokenizer,
|
||||
[dataset["test"]],
|
||||
seq_length=max_packed_sequence_len,
|
||||
)
|
||||
logging.info(
|
||||
f"packing master dataset to len: {cfg.max_packed_sequence_len}"
|
||||
)
|
||||
dataset = Dataset.from_list(list(constant_len_dataset))
|
||||
dataset_train = Dataset.from_list(list(constant_len_dataset_train))
|
||||
dataset_test = Dataset.from_list(list(constant_len_dataset_test))
|
||||
|
||||
# filter out bad data
|
||||
dataset = Dataset.from_list(
|
||||
dataset_train = Dataset.from_list(
|
||||
[
|
||||
d
|
||||
for d in dataset
|
||||
for d in dataset_train
|
||||
if len(d["input_ids"]) < cfg.sequence_len
|
||||
and len(d["input_ids"]) > 0
|
||||
and len(d["input_ids"]) == len(d["attention_mask"])
|
||||
@@ -381,6 +378,19 @@ def load_prepare_datasets(
|
||||
]
|
||||
)
|
||||
|
||||
# filter out bad data
|
||||
dataset_test = Dataset.from_list(
|
||||
[
|
||||
d
|
||||
for d in dataset_test
|
||||
if len(d["input_ids"]) < cfg.sequence_len
|
||||
and len(d["input_ids"]) > 0
|
||||
and len(d["input_ids"]) == len(d["attention_mask"])
|
||||
and len(d["input_ids"]) == len(d["labels"])
|
||||
]
|
||||
)
|
||||
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
logging.info(
|
||||
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
|
||||
@@ -395,9 +405,13 @@ def load_prepare_datasets(
|
||||
private=True,
|
||||
)
|
||||
else:
|
||||
dataset = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
dataset_train = load_tokenized_prepared_datasets(
|
||||
"train", tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
dataset_test = load_tokenized_prepared_datasets(
|
||||
"test", tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
dataset = DatasetDict({"train": dataset_train, "test": dataset_test})
|
||||
|
||||
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
|
||||
logging.info(
|
||||
@@ -412,6 +426,9 @@ def load_prepare_datasets(
|
||||
dataset = dataset.train_test_split(test_size=cfg.val_set_size, shuffle=False)
|
||||
train_dataset = dataset["train"]
|
||||
eval_dataset = dataset["test"]
|
||||
elif "train" in dataset:
|
||||
train_dataset = dataset["train"]
|
||||
eval_dataset = dataset["test"]
|
||||
else:
|
||||
train_dataset = dataset
|
||||
eval_dataset = None
|
||||
|
||||
@@ -202,7 +202,7 @@ def load_model(
|
||||
else True,
|
||||
)
|
||||
load_in_8bit = False
|
||||
elif cfg.is_llama_derived_model and not cfg.trust_remote_code:
|
||||
elif cfg.is_llama_derived_model:
|
||||
from transformers import LlamaForCausalLM
|
||||
|
||||
config = LlamaConfig.from_pretrained(base_model_config)
|
||||
@@ -241,7 +241,7 @@ def load_model(
|
||||
# device=cfg.device,
|
||||
# )
|
||||
# model.train() # sets to train instead of eval mode
|
||||
elif model_type and not cfg.trust_remote_code:
|
||||
elif model_type:
|
||||
model = getattr(transformers, model_type).from_pretrained(
|
||||
base_model,
|
||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
"""Module for custom LRScheduler class"""
|
||||
import math
|
||||
from functools import partial
|
||||
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
|
||||
from torch.optim.lr_scheduler import LRScheduler
|
||||
|
||||
|
||||
class InterpolatingLogScheduler(LRScheduler):
|
||||
@@ -45,58 +42,3 @@ class InterpolatingLogScheduler(LRScheduler):
|
||||
lrs = [self.max_lr for base_lr in self.base_lrs]
|
||||
|
||||
return lrs
|
||||
|
||||
|
||||
def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
|
||||
current_step: int,
|
||||
*,
|
||||
num_warmup_steps: int,
|
||||
num_training_steps: int,
|
||||
num_cycles: float
|
||||
):
|
||||
if current_step < num_warmup_steps:
|
||||
return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
|
||||
progress = float(current_step - num_warmup_steps) / float(
|
||||
max(1, num_training_steps - num_warmup_steps)
|
||||
)
|
||||
return max(
|
||||
0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
|
||||
)
|
||||
|
||||
|
||||
def get_cosine_schedule_with_quadratic_warmup(
|
||||
optimizer: Optimizer,
|
||||
num_warmup_steps: int,
|
||||
num_training_steps: int,
|
||||
num_cycles: float = 0.5,
|
||||
last_epoch: int = -1,
|
||||
):
|
||||
"""
|
||||
Create a schedule with a learning rate that decreases following the values of the cosine function between the
|
||||
initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
|
||||
initial lr set in the optimizer.
|
||||
|
||||
Args:
|
||||
optimizer ([`~torch.optim.Optimizer`]):
|
||||
The optimizer for which to schedule the learning rate.
|
||||
num_warmup_steps (`int`):
|
||||
The number of steps for the warmup phase.
|
||||
num_training_steps (`int`):
|
||||
The total number of training steps.
|
||||
num_cycles (`float`, *optional*, defaults to 0.5):
|
||||
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
|
||||
following a half-cosine).
|
||||
last_epoch (`int`, *optional*, defaults to -1):
|
||||
The index of the last epoch when resuming training.
|
||||
|
||||
Return:
|
||||
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
|
||||
"""
|
||||
|
||||
lr_lambda = partial(
|
||||
_get_cosine_schedule_with_quadratic_warmup_lr_lambda,
|
||||
num_warmup_steps=num_warmup_steps,
|
||||
num_training_steps=num_training_steps,
|
||||
num_cycles=num_cycles,
|
||||
)
|
||||
return LambdaLR(optimizer, lr_lambda, last_epoch)
|
||||
|
||||
@@ -5,82 +5,25 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
from typing import Optional
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch.cuda
|
||||
import torch.nn.functional as F
|
||||
import transformers
|
||||
from torch import nn
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from transformers import (
|
||||
EarlyStoppingCallback,
|
||||
EvalPrediction,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
from transformers import EarlyStoppingCallback, Trainer
|
||||
from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
from axolotl.utils.callbacks import (
|
||||
SaveBetterTransformerModelCallback,
|
||||
SavePeftModelCallback,
|
||||
)
|
||||
from axolotl.utils.schedulers import (
|
||||
InterpolatingLogScheduler,
|
||||
get_cosine_schedule_with_quadratic_warmup,
|
||||
)
|
||||
from axolotl.utils.schedulers import InterpolatingLogScheduler
|
||||
|
||||
|
||||
class AxolotlTrainingArguments(TrainingArguments):
|
||||
"""
|
||||
Extend the base TrainingArguments for axolotl helpers
|
||||
"""
|
||||
|
||||
lr_quadratic_warmup: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Use quadratic warmup for cosine scheduling."},
|
||||
)
|
||||
|
||||
|
||||
class AxolotlTrainer(Trainer):
|
||||
"""
|
||||
Extend the base Trainer for axolotl helpers
|
||||
"""
|
||||
|
||||
args = None # type: AxolotlTrainingArguments
|
||||
|
||||
def create_scheduler(
|
||||
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
||||
):
|
||||
"""
|
||||
Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
|
||||
passed as an argument.
|
||||
|
||||
Args:
|
||||
num_training_steps (int): The number of training steps to do.
|
||||
optimizer (torch.optim.Optimizer): The training optimizer
|
||||
"""
|
||||
|
||||
# fmt: off
|
||||
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
|
||||
# fmt: on
|
||||
if (
|
||||
self.args.lr_scheduler_type == "cosine"
|
||||
and self.args.lr_quadratic_warmup is True
|
||||
):
|
||||
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
|
||||
optimizer,
|
||||
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
|
||||
num_training_steps=num_training_steps,
|
||||
)
|
||||
else:
|
||||
return super().create_scheduler(num_training_steps, optimizer)
|
||||
return self.lr_scheduler
|
||||
|
||||
|
||||
class OneCycleLRSchedulerTrainer(AxolotlTrainer):
|
||||
class OneCycleLRSchedulerTrainer(Trainer):
|
||||
"""
|
||||
Trainer subclass that uses the OneCycleLR scheduler
|
||||
"""
|
||||
@@ -160,9 +103,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
if cfg.fsdp_config:
|
||||
training_arguments_kwargs["fsdp_config"] = dict(cfg.fsdp_config)
|
||||
|
||||
if cfg.lr_quadratic_warmup is not None:
|
||||
training_arguments_kwargs["lr_quadratic_warmup"] = cfg.lr_quadratic_warmup
|
||||
|
||||
# deepspeed
|
||||
if (
|
||||
os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true"
|
||||
@@ -184,11 +124,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
if cfg.max_grad_norm:
|
||||
training_arguments_kwargs["max_grad_norm"] = cfg.max_grad_norm
|
||||
|
||||
if cfg.hub_model_id:
|
||||
training_arguments_kwargs["hub_model_id"] = cfg.hub_model_id
|
||||
if cfg.push_to_hub_model_id:
|
||||
training_arguments_kwargs["push_to_hub_model_id"] = cfg.push_to_hub_model_id
|
||||
training_arguments_kwargs["push_to_hub"] = True
|
||||
|
||||
training_args = AxolotlTrainingArguments(
|
||||
training_args = transformers.TrainingArguments(
|
||||
per_device_train_batch_size=cfg.micro_batch_size,
|
||||
per_device_eval_batch_size=cfg.eval_batch_size
|
||||
if cfg.eval_batch_size is not None
|
||||
@@ -197,9 +137,9 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
eval_accumulation_steps=cfg.gradient_accumulation_steps,
|
||||
num_train_epochs=cfg.num_epochs,
|
||||
learning_rate=cfg.learning_rate,
|
||||
evaluation_strategy="steps" if cfg.val_set_size > 0 else "no",
|
||||
evaluation_strategy="steps",
|
||||
save_strategy="steps" if cfg.save_steps else "epoch",
|
||||
eval_steps=cfg.eval_steps if cfg.val_set_size > 0 else None,
|
||||
eval_steps=cfg.eval_steps,
|
||||
save_steps=cfg.save_steps,
|
||||
output_dir=cfg.output_dir,
|
||||
save_total_limit=3,
|
||||
@@ -335,23 +275,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
|
||||
num_proc=32,
|
||||
)
|
||||
|
||||
if cfg.compute_perplexity_metrics:
|
||||
|
||||
def compute_metrics(eval_preds: EvalPrediction) -> Dict[str, Any]:
|
||||
logits = eval_preds.predictions
|
||||
labels = eval_preds.label_ids
|
||||
cross_entropy_loss = F.cross_entropy(
|
||||
logits.view(-1, model.config.vocab_size), labels.view(-1)
|
||||
)
|
||||
perplexity = torch.exp(cross_entropy_loss)
|
||||
return {"cross_entropy_loss": cross_entropy_loss, "perplexity": perplexity}
|
||||
|
||||
trainer_kwargs["compute_metrics"] = compute_metrics
|
||||
|
||||
trainer_cls = (
|
||||
OneCycleLRSchedulerTrainer
|
||||
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
|
||||
else AxolotlTrainer
|
||||
else transformers.Trainer
|
||||
)
|
||||
trainer = trainer_cls(
|
||||
model=model,
|
||||
|
||||
@@ -87,16 +87,11 @@ def validate_config(cfg):
|
||||
"You probably want to disable group_by_length as it will force a streamed dataset to download completely."
|
||||
)
|
||||
|
||||
if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
|
||||
if any([cfg.adamw_beta1, cfg.adamw_beta2, cfg.adamw_epsilon]) and (
|
||||
not cfg.optimizer or "adamw" not in cfg.optimizer
|
||||
):
|
||||
logging.warning("adamw hyperparameters found, but no adamw optimizer set")
|
||||
|
||||
if cfg.push_to_hub_model_id:
|
||||
raise ValueError(
|
||||
"push_to_hub_model_id is deprecated. Please use hub_model_id instead."
|
||||
)
|
||||
|
||||
# TODO
|
||||
# MPT 7b
|
||||
# https://github.com/facebookresearch/bitsandbytes/issues/25
|
||||
|
||||
@@ -27,7 +27,7 @@ class TestPacking(unittest.TestCase):
|
||||
}
|
||||
)
|
||||
|
||||
def test_resets_attention(self):
|
||||
def test_increments_attention(self):
|
||||
prompter = AlpacaPrompter("chat")
|
||||
strat = AlpacaPromptTokenizingStrategy(
|
||||
prompter,
|
||||
@@ -58,7 +58,7 @@ class TestPacking(unittest.TestCase):
|
||||
|
||||
# but subsequent one does
|
||||
assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
|
||||
assert example["attention_mask"][next_bos_index] == 0
|
||||
assert example["attention_mask"][next_bos_index] == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -268,7 +268,7 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": None,
|
||||
"adam_epsilon": 0.0001,
|
||||
"adamw_epsilon": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -283,7 +283,7 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": "adafactor",
|
||||
"adam_beta1": 0.0001,
|
||||
"adamw_beta1": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -298,9 +298,9 @@ class ValidationTest(unittest.TestCase):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"adam_beta1": 0.9,
|
||||
"adam_beta2": 0.99,
|
||||
"adam_epsilon": 0.0001,
|
||||
"adamw_beta1": 0.0001,
|
||||
"adamw_beta2": 0.0001,
|
||||
"adamw_epsilon": 0.0001,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user