Merge pull request #9 from winglian/dev

feature dump into main
Wing Lian
2023-04-24 21:56:17 -04:00
committed by GitHub
12 changed files with 232 additions and 38 deletions

4
FAQS.md Normal file
View File

@@ -0,0 +1,4 @@
# FAQs
- Can you train StableLM with this? Yes, but only with a single GPU at the moment. Multi-GPU support is coming soon; it is just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
- Will this work with DeepSpeed? That is still a work in progress, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases (see the sketch below)
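
For context, here is a minimal sketch of how that environment variable ends up being consumed when the trainer is built; it mirrors the trainer-setup hunk later in this PR. The helper name `resolve_deepspeed_config` is purely illustrative, while the `./ds_config.json` fallback matches the default this PR adds.

```python
import os
from typing import Optional

import torch


def resolve_deepspeed_config(cfg_deepspeed: Optional[str]) -> Optional[str]:
    """Return a DeepSpeed config path only when the env toggle is set on a multi-GPU machine."""
    if os.environ.get("ACCELERATE_USE_DEEPSPEED") != "true" or torch.cuda.device_count() <= 1:
        return None
    # prefer an explicit config from the YAML, otherwise fall back to the repo default
    return cfg_deepspeed if cfg_deepspeed else "./ds_config.json"
```

In practice you would `export ACCELERATE_USE_DEEPSPEED=true` before launching and pass the resulting path to `transformers.TrainingArguments(deepspeed=...)`.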

View File

@@ -0,0 +1,41 @@
base_model: facebook/galactica-1.3b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
adapter:
lora_model_dir:
sequence_len: 1024
max_packed_sequence_len: 1024
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./lora-llama-alpaca
batch_size: 32
micro_batch_size: 16
num_epochs: 3
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: false
tf32: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
special_tokens:
  pad_token: "[PAD]"
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"

View File

@@ -0,0 +1,39 @@
base_model: huggyllama/llama-13b
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
datasets:
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
dataset_prepared_path: last_run_prepared
val_set_size: 0.002
adapter:
lora_model_dir:
sequence_len: 2048
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./llama-13b-sharegpt
batch_size: 64
micro_batch_size: 2
warmup_steps: 1000
save_steps:
eval_steps:
num_epochs: 5
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
early_stopping_patience: 5
resume_from_checkpoint:
local_rank:

View File

@@ -5,7 +5,8 @@ load_in_8bit: true
datasets:
  - path: data/alpaca_data_gpt4.jsonl
    type: alpaca
  - path: data/vicuna_cleaned.jsonl
  - path: anon8231489123/ShareGPT_Vicuna_unfiltered
    data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
    type: sharegpt
  - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
    type: gpteacher
@@ -30,6 +31,8 @@ wandb_log_model: checkpoint
output_dir: ./lora-llama-alpaca
batch_size: 128
micro_batch_size: 16
warmup_steps: 1000
save_steps:
num_epochs: 5
learning_rate: 0.00003
train_on_inputs: false

33
configs/stability_3b.yml Normal file
View File

@@ -0,0 +1,33 @@
base_model: stabilityai/stablelm-base-alpha-3b
load_in_8bit: true
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.04
adapter:
lora_model_dir:
sequence_len: 4096
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project: stable-llama-3b
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./stable-llama-3b
batch_size: 128
micro_batch_size: 16
num_epochs: 1
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
early_stopping_patience: 3
resume_from_checkpoint:
local_rank:

View File

@@ -11,11 +11,10 @@
"min_loss_scale": 1
},
"scheduler": {
"type": "WarmupLR",
"type": "OneCycle",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
"cycle_min_lr": 1e-7,
"cycle_max_lr": 1e-4
}
},
"zero_optimization": {
@@ -25,7 +24,8 @@
"allgather_bucket_size": 5e8,
"contiguous_gradients": true,
"reduce_bucket_size": "auto",
"reduce_scatter": true
"reduce_scatter": true,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",

View File

@@ -159,7 +159,7 @@ def train(
    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    choose_device(cfg)
    cfg.ddp = cfg.world_size != 1
    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
    if cfg.ddp:
        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
        cfg.gradient_accumulation_steps = (

View File

@@ -1,3 +1,4 @@
import logging
from typing import List
import torch
@@ -92,11 +93,14 @@ class ConstantLengthDataset(IterableDataset):
                        : self.seq_length
                    ]
                    labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
                    yield {
                        "input_ids": input_ids,
                        "labels": labels,
                        "attention_mask": attention_mask,
                    }
                    if labels.size() == input_ids.size() and attention_mask.size() == input_ids.size():
                        yield {
                            "input_ids": input_ids,
                            "labels": labels,
                            "attention_mask": attention_mask,
                        }
                    else:
                        logging.warning("dropping batch due to tensor size mismatch")
                buffer = {"input_ids": [], "attention_mask": [], "labels": []}
                buffer_len = 0

View File

@@ -128,6 +128,10 @@ conv_vicuna_v1_1 = Conversation(
class ShareGPTPrompter:
    def build_prompt(self, source, tokenizer):
        # ignore the system prompt if provided
        if source[0]["from"] == "system":
            source.pop(0)
        if len(source) < 2:
            # If there isn't a back-and-forth conversation, ignore it.
            # This also happens when data splitting leaves empty conversations.

View File

@@ -2,7 +2,8 @@ import logging
from hashlib import md5
from pathlib import Path
from datasets import load_from_disk, load_dataset, IterableDataset, Dataset
from datasets import load_from_disk, load_dataset, IterableDataset, Dataset, concatenate_datasets
from huggingface_hub import hf_hub_download
from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset
from axolotl.prompt_tokenizers import (
@@ -30,7 +31,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
    ds_hash = str(
        md5(
            (
                str(max_packed_sequence_len)
                str(cfg.sequence_len)
                + "@"
                + "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets]))
            ).encode("utf-8")
@@ -43,13 +44,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
    )
    if any(prepared_ds_path.glob("*")):
        logging.info("Loading prepared dataset from disk...")
        logging.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
        dataset = load_from_disk(str(prepared_ds_path))
        logging.info("Prepared dataset loaded from disk...")
    else:
        logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
        logging.info("Loading raw datasets...")
        datasets = []
        for d in cfg.datasets:
            ds = None
            ds_from_hub = False
            try:
                load_dataset(d.path, streaming=True)
@@ -63,8 +66,14 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
"json", data_files=d.path, streaming=True, split=None
)
elif ds_from_hub:
ds = load_dataset(d.path, streaming=True)
if d.data_files:
ds = load_dataset(d.path, streaming=True, data_files=d.data_files)
else:
ds = load_dataset(d.path, streaming=True)
else:
fp = hf_hub_download(repo_id=d.path, repo_type="dataset", filename=d.data_files)
ds = load_dataset("json", data_files=fp, streaming=True, split=None)
if not ds:
raise Exception("unhandled dataset load")
if d.type == "alpaca":
@@ -105,20 +114,32 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
                datasets.append(ds_wrapper)
            else:
                logging.error(f"unhandled prompt tokenization strategy: {d.type}")
        constant_len_dataset = ConstantLengthDataset(
            tokenizer,
            datasets,
            seq_length=max_packed_sequence_len,
        )
        logging.info("merging, packing, shuffling, and splitting master dataset")
        dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split(
            test_size=cfg.val_set_size, shuffle=True, seed=42
        )
        logging.info("tokenizing, merging, and shuffling master dataset")
        samples = []
        for d in datasets:
            samples = samples + [i for i in d]
        dataset = Dataset.from_list(samples).shuffle(seed=42)
        if cfg.local_rank == 0:
            logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}")
            logging.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
            dataset.save_to_disk(prepared_ds_path)
    if cfg.max_packed_sequence_len is not None:
        constant_len_dataset = ConstantLengthDataset(
            tokenizer,
            [dataset],
            seq_length=max_packed_sequence_len,
        )
        logging.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
        dataset = Dataset.from_list([_ for _ in constant_len_dataset])
    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
        logging.info(f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards")
        dataset = dataset.shard(num_shards=cfg.dataset_shard_num, index=cfg.dataset_shard_idx)
    dataset = dataset.train_test_split(
        test_size=cfg.val_set_size, shuffle=False
    )
    train_dataset = dataset["train"]
    eval_dataset = dataset["test"]

View File

@@ -7,11 +7,16 @@ import torch
import transformers
from transformers import (
    AutoModelForCausalLM,
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoTokenizer,
    PreTrainedModel,
)
try:
    from transformers import (
        LlamaForCausalLM,
        LlamaTokenizer,
    )
except ImportError:
    logging.warning("This version of transformers does not support Llama. Consider upgrading.")
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
@@ -70,7 +75,7 @@ def load_model(
            snapshot_download_kwargs = {}
            if cfg.base_model_ignore_patterns:
                snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns
            cache_model_path = Path(snapshot_download(base_model, ** snapshot_download_kwargs))
            cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs))
            files = (
                list(cache_model_path.glob("*.pt"))
                + list(cache_model_path.glob("*.safetensors"))
@@ -95,15 +100,29 @@ def load_model(
                else True,
            )
            load_in_8bit = False
        elif is_llama_derived_model:
            model = LlamaForCausalLM.from_pretrained(
        elif is_llama_derived_model and "LlamaForCausalLM" in globals():
            if not cfg.load_in_8bit:
                model = LlamaForCausalLM.from_pretrained(
                    base_model,
                    device_map=cfg.device_map,
                )
            else:
                model = LlamaForCausalLM.from_pretrained(
                    base_model,
                    load_in_8bit=cfg.load_in_8bit,
                    torch_dtype=torch_dtype,
                    device_map=cfg.device_map,
                )
        elif model_type:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit,
                torch_dtype=torch_dtype,
                device_map=cfg.device_map,
            )
        else:
            model = getattr(transformers, model_type).from_pretrained(
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                load_in_8bit=cfg.load_in_8bit,
                torch_dtype=torch_dtype,
@@ -123,7 +142,7 @@ def load_model(
    if not tokenizer:
        try:
            if is_llama_derived_model:
            if is_llama_derived_model and "LlamaTokenizer" in globals():
                tokenizer = LlamaTokenizer.from_pretrained(model)
            else:
                tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model)
@@ -142,13 +161,17 @@ def load_model(
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
    if cfg.special_tokens:
        for k, v in cfg.special_tokens.items():
            setattr(tokenizer, k, v)
    if load_in_8bit and not cfg.load_4bit:
        logging.info("converting model w/ prepare_model_for_int8_training")
        model = prepare_model_for_int8_training(model)
    model, lora_config = load_adapter(model, cfg, adapter)
    if cfg.ddp:
    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")
    if cfg.load_4bit:

View File

@@ -1,5 +1,9 @@
import math
import os
from pathlib import Path
import bitsandbytes as bnb
import torch.cuda
import transformers
from torch import nn
from torch.optim.lr_scheduler import OneCycleLR
@@ -12,7 +16,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )
    warmup_steps = cfg.warmup_steps if cfg.warmup_steps else min(int(0.03 * total_num_steps), 100)
    logging_steps = max(min(int(0.005 * total_num_steps), 10), 1)
    logging_steps = cfg.logging_steps if cfg.logging_steps else max(min(int(0.005 * total_num_steps), 10), 1)
    save_steps = eval_steps = cfg.save_steps if cfg.save_steps else min(int(0.05 * total_num_steps), 200)
    training_arguments_kwargs = {}
@@ -26,6 +30,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    if cfg.gradient_checkpointing is not None:
        training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
    # deepspeed
    if os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" and torch.cuda.device_count() > 1:
        if cfg.deepspeed:
            training_arguments_kwargs["deepspeed"] = cfg.deepspeed
        else:
            # no explicit config was given, so make a guess
            # TODO search Path("./") for one
            training_arguments_kwargs["deepspeed"] = "./ds_config.json"
    training_args = transformers.TrainingArguments(
        per_device_train_batch_size=cfg.micro_batch_size,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
@@ -37,7 +50,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        save_steps=save_steps,
        output_dir=cfg.output_dir,
        save_total_limit=3,
        load_best_model_at_end=True if cfg.val_set_size > 0 else False,
        load_best_model_at_end=True if cfg.val_set_size > 0 and save_steps % eval_steps == 0 else False,
        ddp_find_unused_parameters=False if cfg.ddp else None,
        group_by_length=cfg.group_by_length,
        report_to="wandb" if cfg.use_wandb else None,
@@ -47,7 +60,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
    trainer_kwargs = {}
    if cfg.load_in_8bit and not cfg.load_4bit:
    if cfg.optimizer == "adam8bit" and not cfg.load_4bit and "deepspeed" not in training_arguments_kwargs:
        decay_parameters = get_parameter_names(model, [nn.LayerNorm])
        decay_parameters = [name for name in decay_parameters if "bias" not in name]
        optimizer_grouped_parameters = [
@@ -94,13 +107,22 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
        )
        trainer_kwargs["callbacks"] = [early_stop_cb]
    data_collator_kwargs = {
        "padding": True,
    }
    if cfg.collator_pad_to_longest:
        data_collator_kwargs["padding"] = "longest"
    else:
        data_collator_kwargs["pad_to_multiple_of"] = 8
    trainer = transformers.Trainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=training_args,
        data_collator=transformers.DataCollatorForSeq2Seq(
            tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
            tokenizer,
            return_tensors="pt",
            **data_collator_kwargs,
        ),
        **trainer_kwargs,
    )