Compare commits

..

10 Commits

Author SHA1 Message Date
Wing Lian
da154e6d56 support for json data as completion 2023-11-25 16:05:04 -05:00
NanoCode012
1115c501b8 Feat: Add Qwen (#894)
* Feat: Add Qwen

* feat: add qwen lora example

* feat: update matrix

* fix: add trust_remote_code

* fix: disable gradient checkpointing

* chore: add warning about gradient checkpointing

* fix: config

* fix: turn off sample packing for this example and reduce seq len

* chore: add comment on seq len
2023-11-26 00:05:01 +09:00
NanoCode012
7ee3c4cacb fix: warning should not show if eval_batch_size not provided (#896) 2023-11-25 16:04:00 +09:00
NanoCode012
fb12895a17 Feat: Add warmup_ratio (#893)
* Feat: Add warmup_ratio

* fix: update readme with more details on conflict
2023-11-25 12:15:43 +09:00
NanoCode012
9fc29e082b chore(doc): Add info on changing role in sharegpt (#886) 2023-11-22 15:32:50 +09:00
NanoCode012
575a082aae fix: revert local dir dataset load (#878) 2023-11-18 22:50:41 +09:00
Mark Saroufim
ddf815022a Install from git url (#874)
* Install from git url

* Update README.md
2023-11-17 12:50:51 -05:00
Wing Lian
9bf854e59c Phi update 202311 (#876)
* add phi modeling from hf

* update for packing and use new modeling class for phi

* update e2e tests for phi to use new model name

* update example phi to also use new phi model name

* use AutoModelForCausalLM for phi lora since sample packing isn't supported
2023-11-17 12:47:17 -05:00
Wing Lian
797f3dd1de don't train if eval split is too small (#873)
* allow zero len dataset

* better handling and warning of small eval splits

* raise error if eval split is too small

* don't mess with calculating total num steps in distributed context

* fix eval_sample_packing training args logic
2023-11-16 11:35:42 -05:00
Wing Lian
0de1457189 try #2: pin hf transformers and accelerate to latest release, don't reinstall pytorch (#867)
* isolate torch from the requirements.txt

* fix typo for removed line ending

* pin transformers and accelerate to latest releases

* try w auto-gptq==0.5.1

* update README to remove manual peft install

* pin xformers to 0.0.22

* bump flash-attn to 2.3.3

* pin flash attn to exact version
2023-11-16 10:42:36 -05:00
19 changed files with 1431 additions and 41 deletions

View File

@@ -71,6 +71,7 @@ jobs:
- name: Install dependencies
run: |
pip3 install --extra-index-url https://download.pytorch.org/whl/cu118 -U torch==2.0.1
pip3 uninstall -y transformers accelerate
pip3 install -U -e .[flash-attn]
pip3 install -r requirements-tests.txt

View File

@@ -77,6 +77,7 @@ Features:
| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
## Quickstart ⚡
@@ -85,14 +86,19 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
**Requirements**: Python >=3.9 and Pytorch >=2.0.
`pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
### For developers
```bash
git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl
pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]'
pip3 install -U git+https://github.com/huggingface/peft.git
```
### Usage
```bash
# finetune lora
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
@@ -494,6 +500,7 @@ is_falcon_derived_model:
is_llama_derived_model:
# Please note that if you set this to true, `padding_side` will be set to "left" by default
is_mistral_derived_model:
is_qwen_derived_model:
# optional overrides to the base model configuration
model_config:
@@ -538,6 +545,8 @@ datasets:
# Optional[str] fastchat conversation type, only used with type: sharegpt
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
field_human: # Optional[str]. Human key to use for conversation.
field_model: # Optional[str]. Assistant key to use for conversation.
# Custom user prompt
- path: repo
@@ -668,7 +677,8 @@ gradient_accumulation_steps: 1
micro_batch_size: 2
eval_batch_size:
num_epochs: 4
warmup_steps: 100
warmup_steps: 100 # cannot use with warmup_ratio
warmup_ratio: 0.05 # cannot use with warmup_steps
learning_rate: 0.00003
lr_quadratic_warmup:
logging_steps:

View File

@@ -4,20 +4,19 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
is_llama_derived_model: true
load_in_8bit: false
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/context-aware-splits-english
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 200
output_dir: ./tiny-llama
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 8192
sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true
adapter: lora
lora_model_dir:
@@ -33,9 +32,9 @@ wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 3
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
@@ -54,13 +53,13 @@ logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 50
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
save_steps: 0.50
save_steps:
debug:
deepspeed:
weight_decay: 0.1
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,5 +1,5 @@
base_model: microsoft/phi-1_5
model_type: MixFormerSequentialForCausalLM
model_type: PhiForCausalLM
tokenizer_type: AutoTokenizer
is_llama_derived_model: false
trust_remote_code: true

68
examples/qwen/lora.yml Normal file
View File

@@ -0,0 +1,68 @@
base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 2048 # supports up to 8192
sample_packing: false
pad_to_sequence_len:
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

68
examples/qwen/qlora.yml Normal file
View File

@@ -0,0 +1,68 @@
base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 2048 # supports up to 8192
sample_packing: false
pad_to_sequence_len:
adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,22 +1,20 @@
--extra-index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
torch==2.0.1
auto-gptq==0.4.2
auto-gptq==0.5.1
packaging
peft==0.6.0
transformers @ git+https://github.com/huggingface/transformers.git@acc394c4f5e1283c19783581790b3dc3105a3697
transformers==4.35.1
bitsandbytes>=0.41.1
accelerate @ git+https://github.com/huggingface/accelerate@80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9
accelerate==0.24.1
deepspeed
addict
fire
PyYAML>=6.0
datasets>=2.14.0
flash-attn>=2.3.0
flash-attn==2.3.3
sentencepiece
wandb
einops
xformers>=0.0.22
xformers==0.0.22
optimum==1.13.2
hf_transfer
colorama

View File

@@ -461,11 +461,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return AxolotlTrainer
def build(self, total_num_steps):
warmup_steps = (
self.cfg.warmup_steps
if self.cfg.warmup_steps is not None
else min(int(0.03 * total_num_steps), 100)
)
warmup_steps = None
if self.cfg.warmup_steps is not None:
warmup_steps = self.cfg.warmup_steps
elif self.cfg.warmup_ratio is not None:
warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0)
else:
warmup_steps = min(int(0.03 * total_num_steps), 100)
logging_steps = (
self.cfg.logging_steps
if self.cfg.logging_steps is not None
@@ -658,7 +661,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
self.cfg.sample_packing if self.cfg.sample_packing else False
)
training_arguments_kwargs["eval_sample_packing"] = (
self.cfg.sample_packing if self.cfg.sample_packing else False
self.cfg.sample_packing
if self.cfg.eval_sample_packing is not False
else False
)
training_arguments_kwargs[
"sample_packing_seq_len_multiplier"

View File

@@ -3,4 +3,6 @@ MixFormers model architecture used for phi models
"""
from .configuration_mixformer_sequential import MixFormerSequentialConfig # noqa
from .configuration_phi import PhiConfig # noqa
from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM # noqa
from .modeling_phi import PhiForCausalLM # noqa

View File

@@ -0,0 +1,65 @@
# pylint: skip-file
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
from typing import Optional
from transformers import PretrainedConfig
class PhiConfig(PretrainedConfig):
"""Phi configuration."""
model_type = "phi"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size: int = 50304,
n_positions: int = 2048,
n_embd: int = 1024,
n_layer: int = 20,
n_inner: Optional[int] = None,
n_head: int = 16,
n_head_kv: Optional[int] = None,
rotary_dim: Optional[int] = 32,
activation_function: Optional[str] = "gelu_new",
flash_attn: bool = False,
flash_rotary: bool = False,
fused_dense: bool = False,
attn_pdrop: float = 0.0,
embd_pdrop: float = 0.0,
resid_pdrop: float = 0.0,
layer_norm_epsilon: float = 1e-5,
initializer_range: float = 0.02,
tie_word_embeddings: bool = False,
pad_vocab_size_multiple: int = 64,
**kwargs
) -> None:
self.vocab_size = int(
math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_inner = n_inner
self.n_head = n_head
self.n_head_kv = n_head_kv
self.rotary_dim = min(rotary_dim, n_embd // n_head)
self.activation_function = activation_function
self.flash_attn = flash_attn
self.flash_rotary = flash_rotary
self.fused_dense = fused_dense
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.resid_pdrop = resid_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,7 @@
"""
Basic completion text
"""
import json
from collections import defaultdict
from typing import Any, Dict, Generator, Optional, Tuple
@@ -64,6 +65,19 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
return next(iter(self.prompter.build_prompt(instruction, input, response)))
class CompletionJSONPromptTokenizationStrategy(CompletionPromptTokenizingStrategy):
"""
Strategy to return the stringified JSON of the entire row as the training data
"""
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
return (
json.dumps(prompt),
"",
"",
)
class CompletionPrompter:
"""
Prompter for completion
@@ -82,7 +96,7 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
strat = CompletionPromptTokenizingStrategy(
CompletionPrompter(),
tokenizer,
cfg.train_on_inputs,
True,
cfg.sequence_len,
max_length=cfg.sequence_len * 64,
)
@@ -90,3 +104,15 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
strat.field = ds_cfg["field"]
return strat
def load_json(tokenizer, cfg):
strat = CompletionJSONPromptTokenizationStrategy(
CompletionPrompter(),
tokenizer,
True,
cfg.sequence_len,
max_length=cfg.sequence_len * 64,
)
return strat

View File

@@ -122,6 +122,19 @@ def normalize_config(cfg):
or (cfg.model_type and "mistral" in cfg.model_type.lower())
)
cfg.is_qwen_derived_model = (
(
hasattr(model_config, "model_type")
and model_config.model_type
in [
"qwen",
]
)
or cfg.is_qwen_derived_model
or "qwen" in cfg.base_model.lower()
or (cfg.model_type and "qwen" in cfg.model_type.lower())
)
if isinstance(cfg.learning_rate, str):
cfg.learning_rate = float(cfg.learning_rate)
@@ -165,7 +178,11 @@ def validate_config(cfg):
"batch_size is not recommended. Please use gradient_accumulation_steps instead.",
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
)
if cfg.eval_batch_size != cfg.micro_batch_size:
if (
cfg.eval_batch_size
and cfg.micro_batch_size
and cfg.eval_batch_size != cfg.micro_batch_size
):
LOG.warning(
"eval_batch_size != micro_batch_size. This can lead to VRAM instability."
)
@@ -372,6 +389,14 @@ def validate_config(cfg):
if cfg.rope_scaling:
LOG.warning("`rope_scaling` should now be be a key under `model_config`")
if cfg.warmup_steps and cfg.warmup_ratio:
raise ValueError("warmup_steps and warmup_ratio are mutually exclusive")
if cfg.is_qwen_derived_model and cfg.gradient_checkpointing:
LOG.warning(
"Gradient checkpointing is broken for Qwen models for transformers>=4.35.0, except main branch."
)
# TODO
# MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -79,6 +79,14 @@ def prepare_dataset(cfg, tokenizer):
train_dataset, eval_dataset = process_datasets_for_packing(
cfg, train_dataset, eval_dataset, tokenizer
)
if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
if total_eval_steps == 0:
raise ValueError(
"eval dataset split is too small for sample_packing. You should set `eval_sample_packing: False`. "
)
if cfg.max_steps:
total_num_steps = min(
calculate_total_num_steps(cfg, train_dataset), cfg.max_steps
@@ -234,7 +242,14 @@ def load_tokenized_prepared_datasets(
local_path = Path(config_dataset.path)
if local_path.exists():
if local_path.is_dir():
ds = load_from_disk(config_dataset.path)
# TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
ds = load_dataset(
config_dataset.path,
name=config_dataset.name,
data_files=config_dataset.data_files,
streaming=False,
split=None,
)
elif local_path.is_file():
ds_type = get_ds_type(config_dataset)

View File

@@ -84,6 +84,18 @@ def load_tokenizer(cfg):
if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
tokenizer.padding_side = "left"
# Qwen base only has single token, so we need to set the special tokens
if cfg.is_qwen_derived_model:
token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
for attr_name in token_ids:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, tokenizer.eod_id)
token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
for attr_name in token_names:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, "<|endoftext|>")
if cfg.special_tokens:
for k, val in cfg.special_tokens.items():
tokenizer.add_special_tokens(
@@ -288,10 +300,10 @@ def load_model(
# device=cfg.device,
# )
# model.train() # sets to train instead of eval mode
elif model_type == "MixFormerSequentialForCausalLM":
from axolotl.models.phi import MixFormerSequentialForCausalLM
elif model_type == "PhiForCausalLM":
from axolotl.models.phi import PhiForCausalLM
model = MixFormerSequentialForCausalLM.from_pretrained(
model = PhiForCausalLM.from_pretrained(
base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,

View File

@@ -182,7 +182,7 @@ class MultipackBatchSampler(BatchSampler):
# shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
return max(
1,
0,
(
world_size
* math.floor(

View File

@@ -141,7 +141,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
return train_dataset, eval_dataset
def calculate_total_num_steps(cfg, train_dataset):
def calculate_total_num_steps(cfg, train_dataset, update=True):
if not cfg.total_num_tokens:
total_num_tokens = np.sum(
train_dataset.data.column("input_ids")
@@ -150,7 +150,8 @@ def calculate_total_num_steps(cfg, train_dataset):
.values
)
LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
cfg.total_num_tokens = total_num_tokens
if update:
cfg.total_num_tokens = total_num_tokens
if not cfg.total_supervised_tokens:
total_supervised_tokens = (
@@ -163,7 +164,8 @@ def calculate_total_num_steps(cfg, train_dataset):
f"`total_supervised_tokens: {total_supervised_tokens}`",
main_process_only=True,
)
cfg.total_supervised_tokens = total_supervised_tokens
if update:
cfg.total_supervised_tokens = total_supervised_tokens
if cfg.sample_packing:
# we have to drop anything longer then sequence len otherwise
@@ -232,7 +234,8 @@ def calculate_total_num_steps(cfg, train_dataset):
sample_packing_eff_est = (
math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
)
cfg.sample_packing_eff_est = sample_packing_eff_est
if update:
cfg.sample_packing_eff_est = sample_packing_eff_est
LOG.debug(
f"sample_packing_eff_est: {cfg.sample_packing_eff_est}",
main_process_only=True,

View File

@@ -31,7 +31,7 @@ class TestPhi(unittest.TestCase):
{
"base_model": "microsoft/phi-1_5",
"trust_remote_code": True,
"model_type": "MixFormerSequentialForCausalLM",
"model_type": "PhiForCausalLM",
"tokenizer_type": "AutoTokenizer",
"sequence_len": 512,
"sample_packing": False,
@@ -76,7 +76,7 @@ class TestPhi(unittest.TestCase):
{
"base_model": "microsoft/phi-1_5",
"trust_remote_code": True,
"model_type": "MixFormerSequentialForCausalLM",
"model_type": "PhiForCausalLM",
"tokenizer_type": "AutoTokenizer",
"sequence_len": 512,
"sample_packing": True,

View File

@@ -649,3 +649,33 @@ class ValidationTest(unittest.TestCase):
)
validate_config(cfg)
def test_warmup_step_no_conflict(self):
cfg = DictDefault(
{
"warmup_steps": 10,
"warmup_ratio": 0.1,
}
)
with pytest.raises(
ValueError,
match=r".*warmup_steps and warmup_ratio are mutually exclusive*",
):
validate_config(cfg)
cfg = DictDefault(
{
"warmup_steps": 10,
}
)
validate_config(cfg)
cfg = DictDefault(
{
"warmup_ratio": 0.1,
}
)
validate_config(cfg)