Merge pull request #9 from winglian/dev

feature dump into main
This commit is contained in:
Wing Lian
2023-04-24 21:56:17 -04:00
committed by GitHub
12 changed files with 232 additions and 38 deletions

4
FAQS.md Normal file
View File

@@ -0,0 +1,4 @@
# FAQs
- Can you train StableLM with this? Yes, but only with a single GPU at the moment. Multi-GPU support is coming soon! It is just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874).
- Will this work with DeepSpeed? That's still a work in progress, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases.

View File

@@ -0,0 +1,41 @@
base_model: facebook/galactica-1.3b
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
load_in_8bit: false
datasets:
- path: tatsu-lab/alpaca
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
adapter:
lora_model_dir:
sequence_len: 1024
max_packed_sequence_len: 1024
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./lora-llama-alpaca
batch_size: 32
micro_batch_size: 16
num_epochs: 3
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: false
tf32: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
special_tokens:
pad_token: "[PAD]"
bos_token: "<s>"
eos_token: "</s>"
unk_token: "<unk>"

View File

@@ -0,0 +1,39 @@
base_model: huggyllama/llama-13b
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
load_in_8bit: true
datasets:
- path: anon8231489123/ShareGPT_Vicuna_unfiltered
data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
type: sharegpt
dataset_prepared_path: last_run_prepared
val_set_size: 0.002
adapter:
lora_model_dir:
sequence_len: 2048
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project:
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./llama-13b-sharegpt
batch_size: 64
micro_batch_size: 2
warmup_steps: 1000
save_steps:
eval_steps:
num_epochs: 5
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
early_stopping_patience: 5
resume_from_checkpoint:
local_rank:

View File

@@ -5,7 +5,8 @@ load_in_8bit: true
datasets: datasets:
- path: data/alpaca_data_gpt4.jsonl - path: data/alpaca_data_gpt4.jsonl
type: alpaca type: alpaca
- path: data/vicuna_cleaned.jsonl - path: anon8231489123/ShareGPT_Vicuna_unfiltered
data_files: ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json
type: sharegpt type: sharegpt
- path: data/gpt4-instruct-similarity-0.6-dataset.jsonl - path: data/gpt4-instruct-similarity-0.6-dataset.jsonl
type: gpteacher type: gpteacher
@@ -30,6 +31,8 @@ wandb_log_model: checkpoint
output_dir: ./lora-llama-alpaca output_dir: ./lora-llama-alpaca
batch_size: 128 batch_size: 128
micro_batch_size: 16 micro_batch_size: 16
warmup_steps: 1000
save_steps:
num_epochs: 5 num_epochs: 5
learning_rate: 0.00003 learning_rate: 0.00003
train_on_inputs: false train_on_inputs: false

33
configs/stability_3b.yml Normal file
View File

@@ -0,0 +1,33 @@
base_model: stabilityai/stablelm-base-alpha-3b
load_in_8bit: true
datasets:
- path: vicgalle/alpaca-gpt4
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.04
adapter:
lora_model_dir:
sequence_len: 4096
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
lora_fan_in_fan_out: false
wandb_project: stable-llama-3b
wandb_watch:
wandb_run_id:
wandb_log_model: checkpoint
output_dir: ./stable-llama-3b
batch_size: 128
micro_batch_size: 16
num_epochs: 1
learning_rate: 0.00003
train_on_inputs: false
group_by_length: false
bf16: true
tf32: true
early_stopping_patience: 3
resume_from_checkpoint:
local_rank:

View File

@@ -11,11 +11,10 @@
"min_loss_scale": 1 "min_loss_scale": 1
}, },
"scheduler": { "scheduler": {
"type": "WarmupLR", "type": "OneCycle",
"params": { "params": {
"warmup_min_lr": "auto", "cycle_min_lr": 1e-7,
"warmup_max_lr": "auto", "cycle_max_lr": 1e-4
"warmup_num_steps": "auto"
} }
}, },
"zero_optimization": { "zero_optimization": {
@@ -25,7 +24,8 @@
"allgather_bucket_size": 5e8, "allgather_bucket_size": 5e8,
"contiguous_gradients": true, "contiguous_gradients": true,
"reduce_bucket_size": "auto", "reduce_bucket_size": "auto",
"reduce_scatter": true "reduce_scatter": true,
"stage3_gather_16bit_weights_on_model_save": true
}, },
"gradient_accumulation_steps": "auto", "gradient_accumulation_steps": "auto",
"gradient_clipping": "auto", "gradient_clipping": "auto",

View File

@@ -159,7 +159,7 @@ def train(
cfg.world_size = int(os.environ.get("WORLD_SIZE", 1)) cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0)) cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
choose_device(cfg) choose_device(cfg)
cfg.ddp = cfg.world_size != 1 cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
if cfg.ddp: if cfg.ddp:
cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))} cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
cfg.gradient_accumulation_steps = ( cfg.gradient_accumulation_steps = (

View File

@@ -1,3 +1,4 @@
import logging
from typing import List from typing import List
import torch import torch
@@ -92,11 +93,14 @@ class ConstantLengthDataset(IterableDataset):
: self.seq_length : self.seq_length
] ]
labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length] labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
yield { if labels.size() == input_ids.size() and attention_mask.size() == input_ids.size():
"input_ids": input_ids, yield {
"labels": labels, "input_ids": input_ids,
"attention_mask": attention_mask, "labels": labels,
} "attention_mask": attention_mask,
}
else:
logging.warning("dropping batch due to tensor size mismatch")
buffer = {"input_ids": [], "attention_mask": [], "labels": []} buffer = {"input_ids": [], "attention_mask": [], "labels": []}
buffer_len = 0 buffer_len = 0

View File

@@ -128,6 +128,10 @@ conv_vicuna_v1_1 = Conversation(
class ShareGPTPrompter: class ShareGPTPrompter:
def build_prompt(self, source, tokenizer): def build_prompt(self, source, tokenizer):
# ignore the system prompt if provided
if source[0]["from"] == "system":
source.pop(0)
if len(source) < 2: if len(source) < 2:
# If there isn't a back and forth conversation, ignore it # If there isn't a back and forth conversation, ignore it
# also happens on the data splitting leaving empty conversations # also happens on the data splitting leaving empty conversations

View File

@@ -2,7 +2,8 @@ import logging
from hashlib import md5 from hashlib import md5
from pathlib import Path from pathlib import Path
from datasets import load_from_disk, load_dataset, IterableDataset, Dataset from datasets import load_from_disk, load_dataset, IterableDataset, Dataset, concatenate_datasets
from huggingface_hub import hf_hub_download
from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset from axolotl.datasets import TokenizedPromptDataset, ConstantLengthDataset
from axolotl.prompt_tokenizers import ( from axolotl.prompt_tokenizers import (
@@ -30,7 +31,7 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
ds_hash = str( ds_hash = str(
md5( md5(
( (
str(max_packed_sequence_len) str(cfg.sequence_len)
+ "@" + "@"
+ "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets])) + "|".join(sorted([f"{d.path}:{d.type}" for d in cfg.datasets]))
).encode("utf-8") ).encode("utf-8")
@@ -43,13 +44,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
) )
if any(prepared_ds_path.glob("*")): if any(prepared_ds_path.glob("*")):
logging.info("Loading prepared dataset from disk...") logging.info(f"Loading prepared dataset from disk ay {prepared_ds_path}...")
dataset = load_from_disk(str(prepared_ds_path)) dataset = load_from_disk(str(prepared_ds_path))
logging.info("Prepared dataset loaded from disk...") logging.info("Prepared dataset loaded from disk...")
else: else:
logging.info(f"Unable to find prepared dataset in {prepared_ds_path}")
logging.info("Loading raw datasets...") logging.info("Loading raw datasets...")
datasets = [] datasets = []
for d in cfg.datasets: for d in cfg.datasets:
ds = None
ds_from_hub = False ds_from_hub = False
try: try:
load_dataset(d.path, streaming=True) load_dataset(d.path, streaming=True)
@@ -63,8 +66,14 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
"json", data_files=d.path, streaming=True, split=None "json", data_files=d.path, streaming=True, split=None
) )
elif ds_from_hub: elif ds_from_hub:
ds = load_dataset(d.path, streaming=True) if d.data_files:
ds = load_dataset(d.path, streaming=True, data_files=d.data_files)
else:
ds = load_dataset(d.path, streaming=True)
else: else:
fp = hf_hub_download(repo_id=d.path, repo_type="dataset", filename=d.data_files)
ds = load_dataset("json", data_files=fp, streaming=True, split=None)
if not ds:
raise Exception("unhandled dataset load") raise Exception("unhandled dataset load")
if d.type == "alpaca": if d.type == "alpaca":
@@ -105,20 +114,32 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
datasets.append(ds_wrapper) datasets.append(ds_wrapper)
else: else:
logging.error(f"unhandled prompt tokenization strategy: {d.type}") logging.error(f"unhandled prompt tokenization strategy: {d.type}")
constant_len_dataset = ConstantLengthDataset( logging.info("tokenizing, merging, and shuffling master dataset")
tokenizer,
datasets,
seq_length=max_packed_sequence_len,
)
logging.info("merging, packing, shuffling, and splitting master dataset")
dataset = Dataset.from_list([_ for _ in constant_len_dataset]).train_test_split(
test_size=cfg.val_set_size, shuffle=True, seed=42
)
samples = []
for d in datasets:
samples = samples + [i for i in d]
dataset = Dataset.from_list(samples).shuffle(seed=42)
if cfg.local_rank == 0: if cfg.local_rank == 0:
logging.info(f"Saving prepared dataset to disk... {prepared_ds_path}") logging.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
dataset.save_to_disk(prepared_ds_path) dataset.save_to_disk(prepared_ds_path)
if cfg.max_packed_sequence_len is not None:
constant_len_dataset = ConstantLengthDataset(
tokenizer,
[dataset],
seq_length=max_packed_sequence_len,
)
logging.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
dataset = Dataset.from_list([_ for _ in constant_len_dataset])
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
logging.info(f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards")
dataset = dataset.shard(num_shards=cfg.dataset_shard_num, index=cfg.dataset_shard_idx)
dataset = dataset.train_test_split(
test_size=cfg.val_set_size, shuffle=False
)
train_dataset = dataset["train"] train_dataset = dataset["train"]
eval_dataset = dataset["test"] eval_dataset = dataset["test"]

View File

@@ -7,11 +7,16 @@ import torch
import transformers import transformers
from transformers import ( from transformers import (
AutoModelForCausalLM, AutoModelForCausalLM,
LlamaForCausalLM,
LlamaTokenizer,
AutoTokenizer, AutoTokenizer,
PreTrainedModel, PreTrainedModel,
) )
try:
from transformers import (
LlamaForCausalLM,
LlamaTokenizer,
)
except:
logging.warning("This version of transformers does not support Llama. Consider upgrading.")
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
@@ -70,7 +75,7 @@ def load_model(
snapshot_download_kwargs = {} snapshot_download_kwargs = {}
if cfg.base_model_ignore_patterns: if cfg.base_model_ignore_patterns:
snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns snapshot_download_kwargs["ignore_patterns"] = cfg.base_model_ignore_patterns
cache_model_path = Path(snapshot_download(base_model, ** snapshot_download_kwargs)) cache_model_path = Path(snapshot_download(base_model, **snapshot_download_kwargs))
files = ( files = (
list(cache_model_path.glob("*.pt")) list(cache_model_path.glob("*.pt"))
+ list(cache_model_path.glob("*.safetensors")) + list(cache_model_path.glob("*.safetensors"))
@@ -95,15 +100,29 @@ def load_model(
else True, else True,
) )
load_in_8bit = False load_in_8bit = False
elif is_llama_derived_model: elif is_llama_derived_model and "LlamaForCausalLM" in globals():
model = LlamaForCausalLM.from_pretrained( if not cfg.load_in_8bit:
model = LlamaForCausalLM.from_pretrained(
base_model,
device_map=cfg.device_map,
)
else:
model = LlamaForCausalLM.from_pretrained(
base_model,
load_in_8bit=cfg.load_in_8bit,
torch_dtype=torch_dtype,
device_map=cfg.device_map,
)
elif model_type:
model = getattr(transformers, model_type).from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit, load_in_8bit=cfg.load_in_8bit,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
device_map=cfg.device_map, device_map=cfg.device_map,
) )
else: else:
model = getattr(transformers, model_type).from_pretrained( model = AutoModelForCausalLM.from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit, load_in_8bit=cfg.load_in_8bit,
torch_dtype=torch_dtype, torch_dtype=torch_dtype,
@@ -123,7 +142,7 @@ def load_model(
if not tokenizer: if not tokenizer:
try: try:
if is_llama_derived_model: if is_llama_derived_model and "LlamaTokenizer" in globals():
tokenizer = LlamaTokenizer.from_pretrained(model) tokenizer = LlamaTokenizer.from_pretrained(model)
else: else:
tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model) tokenizer = getattr(transformers, tokenizer_type).from_pretrained(model)
@@ -142,13 +161,17 @@ def load_model(
tokenizer.add_special_tokens({"pad_token": "[PAD]"}) tokenizer.add_special_tokens({"pad_token": "[PAD]"})
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
if cfg.special_tokens:
for k, v in cfg.special_tokens.items():
setattr(tokenizer, k, v)
if load_in_8bit and not cfg.load_4bit: if load_in_8bit and not cfg.load_4bit:
logging.info("converting model w/ prepare_model_for_int8_training") logging.info("converting model w/ prepare_model_for_int8_training")
model = prepare_model_for_int8_training(model) model = prepare_model_for_int8_training(model)
model, lora_config = load_adapter(model, cfg, adapter) model, lora_config = load_adapter(model, cfg, adapter)
if cfg.ddp: if cfg.ddp and not load_in_8bit:
model.to(f"cuda:{cfg.local_rank}") model.to(f"cuda:{cfg.local_rank}")
if cfg.load_4bit: if cfg.load_4bit:

View File

@@ -1,5 +1,9 @@
import math import math
import os
from pathlib import Path
import bitsandbytes as bnb import bitsandbytes as bnb
import torch.cuda
import transformers import transformers
from torch import nn from torch import nn
from torch.optim.lr_scheduler import OneCycleLR from torch.optim.lr_scheduler import OneCycleLR
@@ -12,7 +16,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size) math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
) )
warmup_steps = cfg.warmup_steps if cfg.warmup_steps else min(int(0.03 * total_num_steps), 100) warmup_steps = cfg.warmup_steps if cfg.warmup_steps else min(int(0.03 * total_num_steps), 100)
logging_steps = max(min(int(0.005 * total_num_steps), 10), 1) logging_steps = cfg.logging_steps if cfg.logging_steps else max(min(int(0.005 * total_num_steps), 10), 1)
save_steps = eval_steps = cfg.save_steps if cfg.save_steps else min(int(0.05 * total_num_steps), 200) save_steps = eval_steps = cfg.save_steps if cfg.save_steps else min(int(0.05 * total_num_steps), 200)
training_arguments_kwargs = {} training_arguments_kwargs = {}
@@ -26,6 +30,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
if cfg.gradient_checkpointing is not None: if cfg.gradient_checkpointing is not None:
training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing training_arguments_kwargs["gradient_checkpointing"] = cfg.gradient_checkpointing
# deepspeed
if os.environ.get("ACCELERATE_USE_DEEPSPEED") == "true" and torch.cuda.device_count() > 1:
if cfg.deepspeed:
training_arguments_kwargs["deepspeed"] = cfg.deepspeed
else:
# make a guess here
# TODO search Path("./") for one
training_arguments_kwargs["deepspeed"] = "./ds_config.json"
training_args = transformers.TrainingArguments( training_args = transformers.TrainingArguments(
per_device_train_batch_size=cfg.micro_batch_size, per_device_train_batch_size=cfg.micro_batch_size,
gradient_accumulation_steps=cfg.gradient_accumulation_steps, gradient_accumulation_steps=cfg.gradient_accumulation_steps,
@@ -37,7 +50,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
save_steps=save_steps, save_steps=save_steps,
output_dir=cfg.output_dir, output_dir=cfg.output_dir,
save_total_limit=3, save_total_limit=3,
load_best_model_at_end=True if cfg.val_set_size > 0 else False, load_best_model_at_end=True if cfg.val_set_size > 0 and save_steps % eval_steps == 0 else False,
ddp_find_unused_parameters=False if cfg.ddp else None, ddp_find_unused_parameters=False if cfg.ddp else None,
group_by_length=cfg.group_by_length, group_by_length=cfg.group_by_length,
report_to="wandb" if cfg.use_wandb else None, report_to="wandb" if cfg.use_wandb else None,
@@ -47,7 +60,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
trainer_kwargs = {} trainer_kwargs = {}
if cfg.load_in_8bit and not cfg.load_4bit: if cfg.optimizer == "adam8bit" and not cfg.load_4bit and not "deepspeed" in training_arguments_kwargs:
decay_parameters = get_parameter_names(model, [nn.LayerNorm]) decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name] decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [ optimizer_grouped_parameters = [
@@ -94,13 +107,22 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer):
) )
trainer_kwargs["callbacks"] = [early_stop_cb] trainer_kwargs["callbacks"] = [early_stop_cb]
data_collator_kwargs = {
"padding": True,
}
if cfg.collator_pad_to_longest:
data_collator_kwargs["padding"] = "longest"
else:
data_collator_kwargs["pad_to_multiple_of"] = 8
trainer = transformers.Trainer( trainer = transformers.Trainer(
model=model, model=model,
train_dataset=train_dataset, train_dataset=train_dataset,
eval_dataset=eval_dataset, eval_dataset=eval_dataset,
args=training_args, args=training_args,
data_collator=transformers.DataCollatorForSeq2Seq( data_collator=transformers.DataCollatorForSeq2Seq(
tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True tokenizer,
return_tensors="pt",
**data_collator_kwargs,
), ),
**trainer_kwargs, **trainer_kwargs,
) )