Compare commits

..

1 Commit

Author SHA1 Message Date
mhenrichsen
9084879861 tinyllama 2023-11-16 13:36:01 +00:00
19 changed files with 41 additions and 1431 deletions

View File

@@ -71,7 +71,6 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
pip3 install --extra-index-url https://download.pytorch.org/whl/cu118 -U torch==2.0.1
pip3 uninstall -y transformers accelerate pip3 uninstall -y transformers accelerate
pip3 install -U -e .[flash-attn] pip3 install -U -e .[flash-attn]
pip3 install -r requirements-tests.txt pip3 install -r requirements-tests.txt

View File

@@ -77,7 +77,6 @@ Features:
| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ | | XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ | | phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ | | RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
## Quickstart ⚡ ## Quickstart ⚡
@@ -86,19 +85,14 @@ Get started with Axolotl in just a few steps! This quickstart guide will walk yo
**Requirements**: Python >=3.9 and Pytorch >=2.0. **Requirements**: Python >=3.9 and Pytorch >=2.0.
`pip3 install "axolotl[flash-attn,deepspeed] @ git+https://github.com/OpenAccess-AI-Collective/axolotl"`
### For developers
```bash ```bash
git clone https://github.com/OpenAccess-AI-Collective/axolotl git clone https://github.com/OpenAccess-AI-Collective/axolotl
cd axolotl cd axolotl
pip3 install packaging pip3 install packaging
pip3 install -e '.[flash-attn,deepspeed]' pip3 install -e '.[flash-attn,deepspeed]'
``` pip3 install -U git+https://github.com/huggingface/peft.git
### Usage
```bash
# finetune lora # finetune lora
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
@@ -500,7 +494,6 @@ is_falcon_derived_model:
is_llama_derived_model: is_llama_derived_model:
# Please note that if you set this to true, `padding_side` will be set to "left" by default # Please note that if you set this to true, `padding_side` will be set to "left" by default
is_mistral_derived_model: is_mistral_derived_model:
is_qwen_derived_model:
# optional overrides to the base model configuration # optional overrides to the base model configuration
model_config: model_config:
@@ -545,8 +538,6 @@ datasets:
# Optional[str] fastchat conversation type, only used with type: sharegpt # Optional[str] fastchat conversation type, only used with type: sharegpt
conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
field_human: # Optional[str]. Human key to use for conversation.
field_model: # Optional[str]. Assistant key to use for conversation.
# Custom user prompt # Custom user prompt
- path: repo - path: repo
@@ -677,8 +668,7 @@ gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 2
eval_batch_size: eval_batch_size:
num_epochs: 4 num_epochs: 4
warmup_steps: 100 # cannot use with warmup_ratio warmup_steps: 100
warmup_ratio: 0.05 # cannot use with warmup_steps
learning_rate: 0.00003 learning_rate: 0.00003
lr_quadratic_warmup: lr_quadratic_warmup:
logging_steps: logging_steps:

View File

@@ -4,19 +4,20 @@ model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer tokenizer_type: LlamaTokenizer
is_llama_derived_model: true is_llama_derived_model: true
load_in_8bit: true load_in_8bit: false
load_in_4bit: false load_in_4bit: false
strict: false strict: false
datasets: datasets:
- path: mhenrichsen/alpaca_2k_test - path: mhenrichsen/context-aware-splits-english
type: alpaca type: alpaca
dataset_prepared_path: dataset_prepared_path:
val_set_size: 0.05 val_set_size: 200
output_dir: ./lora-out output_dir: ./tiny-llama
sequence_len: 4096 sequence_len: 8192
sample_packing: true sample_packing: true
pad_to_sequence_len: true
adapter: lora adapter: lora
lora_model_dir: lora_model_dir:
@@ -32,9 +33,9 @@ wandb_watch:
wandb_run_id: wandb_run_id:
wandb_log_model: wandb_log_model:
gradient_accumulation_steps: 4 gradient_accumulation_steps: 1
micro_batch_size: 2 micro_batch_size: 8
num_epochs: 4 num_epochs: 3
optimizer: adamw_bnb_8bit optimizer: adamw_bnb_8bit
lr_scheduler: cosine lr_scheduler: cosine
learning_rate: 0.0002 learning_rate: 0.0002
@@ -53,13 +54,13 @@ logging_steps: 1
xformers_attention: xformers_attention:
flash_attention: true flash_attention: true
warmup_steps: 10 warmup_steps: 50
eval_steps: 0.05 eval_steps: 0.05
eval_table_size: eval_table_size:
save_steps: save_steps: 0.50
debug: debug:
deepspeed: deepspeed:
weight_decay: 0.0 weight_decay: 0.1
fsdp: fsdp:
fsdp_config: fsdp_config:
special_tokens: special_tokens:

View File

@@ -1,5 +1,5 @@
base_model: microsoft/phi-1_5 base_model: microsoft/phi-1_5
model_type: PhiForCausalLM model_type: MixFormerSequentialForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
is_llama_derived_model: false is_llama_derived_model: false
trust_remote_code: true trust_remote_code: true

View File

@@ -1,68 +0,0 @@
base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: true
load_in_4bit: false
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 2048 # supports up to 8192
sample_packing: false
pad_to_sequence_len:
adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,68 +0,0 @@
base_model: Qwen/Qwen-7B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
is_qwen_derived_model: true
trust_remote_code: true
load_in_8bit: false
load_in_4bit: true
strict: false
datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
sequence_len: 2048 # supports up to 8192
sample_packing: false
pad_to_sequence_len:
adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:
wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16: false
tf32: false
gradient_checkpointing: false
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
warmup_steps: 10
eval_steps: 0.05
eval_table_size:
eval_table_max_new_tokens: 128
save_steps:
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:

View File

@@ -1,20 +1,22 @@
--extra-index-url https://download.pytorch.org/whl/cu118
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
auto-gptq==0.5.1 torch==2.0.1
auto-gptq==0.4.2
packaging packaging
peft==0.6.0 peft==0.6.0
transformers==4.35.1 transformers @ git+https://github.com/huggingface/transformers.git@acc394c4f5e1283c19783581790b3dc3105a3697
bitsandbytes>=0.41.1 bitsandbytes>=0.41.1
accelerate==0.24.1 accelerate @ git+https://github.com/huggingface/accelerate@80da9cfb09bb3cc9f1b385cb55d6b90d025a5fd9
deepspeed deepspeed
addict addict
fire fire
PyYAML>=6.0 PyYAML>=6.0
datasets>=2.14.0 datasets>=2.14.0
flash-attn==2.3.3 flash-attn>=2.3.0
sentencepiece sentencepiece
wandb wandb
einops einops
xformers==0.0.22 xformers>=0.0.22
optimum==1.13.2 optimum==1.13.2
hf_transfer hf_transfer
colorama colorama

View File

@@ -461,14 +461,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return AxolotlTrainer return AxolotlTrainer
def build(self, total_num_steps): def build(self, total_num_steps):
warmup_steps = None warmup_steps = (
if self.cfg.warmup_steps is not None: self.cfg.warmup_steps
warmup_steps = self.cfg.warmup_steps if self.cfg.warmup_steps is not None
elif self.cfg.warmup_ratio is not None: else min(int(0.03 * total_num_steps), 100)
warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0) )
else:
warmup_steps = min(int(0.03 * total_num_steps), 100)
logging_steps = ( logging_steps = (
self.cfg.logging_steps self.cfg.logging_steps
if self.cfg.logging_steps is not None if self.cfg.logging_steps is not None
@@ -661,9 +658,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
self.cfg.sample_packing if self.cfg.sample_packing else False self.cfg.sample_packing if self.cfg.sample_packing else False
) )
training_arguments_kwargs["eval_sample_packing"] = ( training_arguments_kwargs["eval_sample_packing"] = (
self.cfg.sample_packing self.cfg.sample_packing if self.cfg.sample_packing else False
if self.cfg.eval_sample_packing is not False
else False
) )
training_arguments_kwargs[ training_arguments_kwargs[
"sample_packing_seq_len_multiplier" "sample_packing_seq_len_multiplier"

View File

@@ -3,6 +3,4 @@ MixFormers model architecture used for phi models
""" """
from .configuration_mixformer_sequential import MixFormerSequentialConfig # noqa from .configuration_mixformer_sequential import MixFormerSequentialConfig # noqa
from .configuration_phi import PhiConfig # noqa
from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM # noqa from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM # noqa
from .modeling_phi import PhiForCausalLM # noqa

View File

@@ -1,65 +0,0 @@
# pylint: skip-file
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
from typing import Optional
from transformers import PretrainedConfig
class PhiConfig(PretrainedConfig):
"""Phi configuration."""
model_type = "phi"
attribute_map = {
"max_position_embeddings": "n_positions",
"hidden_size": "n_embd",
"num_attention_heads": "n_head",
"num_hidden_layers": "n_layer",
}
def __init__(
self,
vocab_size: int = 50304,
n_positions: int = 2048,
n_embd: int = 1024,
n_layer: int = 20,
n_inner: Optional[int] = None,
n_head: int = 16,
n_head_kv: Optional[int] = None,
rotary_dim: Optional[int] = 32,
activation_function: Optional[str] = "gelu_new",
flash_attn: bool = False,
flash_rotary: bool = False,
fused_dense: bool = False,
attn_pdrop: float = 0.0,
embd_pdrop: float = 0.0,
resid_pdrop: float = 0.0,
layer_norm_epsilon: float = 1e-5,
initializer_range: float = 0.02,
tie_word_embeddings: bool = False,
pad_vocab_size_multiple: int = 64,
**kwargs
) -> None:
self.vocab_size = int(
math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
)
self.n_positions = n_positions
self.n_embd = n_embd
self.n_layer = n_layer
self.n_inner = n_inner
self.n_head = n_head
self.n_head_kv = n_head_kv
self.rotary_dim = min(rotary_dim, n_embd // n_head)
self.activation_function = activation_function
self.flash_attn = flash_attn
self.flash_rotary = flash_rotary
self.fused_dense = fused_dense
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.resid_pdrop = resid_pdrop
self.layer_norm_epsilon = layer_norm_epsilon
self.initializer_range = initializer_range
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

File diff suppressed because it is too large Load Diff

View File

@@ -1,7 +1,6 @@
""" """
Basic completion text Basic completion text
""" """
import json
from collections import defaultdict from collections import defaultdict
from typing import Any, Dict, Generator, Optional, Tuple from typing import Any, Dict, Generator, Optional, Tuple
@@ -65,19 +64,6 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
return next(iter(self.prompter.build_prompt(instruction, input, response))) return next(iter(self.prompter.build_prompt(instruction, input, response)))
class CompletionJSONPromptTokenizationStrategy(CompletionPromptTokenizingStrategy):
"""
Strategy to return the stringified JSON of the entire row as the training data
"""
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
return (
json.dumps(prompt),
"",
"",
)
class CompletionPrompter: class CompletionPrompter:
""" """
Prompter for completion Prompter for completion
@@ -96,7 +82,7 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
strat = CompletionPromptTokenizingStrategy( strat = CompletionPromptTokenizingStrategy(
CompletionPrompter(), CompletionPrompter(),
tokenizer, tokenizer,
True, cfg.train_on_inputs,
cfg.sequence_len, cfg.sequence_len,
max_length=cfg.sequence_len * 64, max_length=cfg.sequence_len * 64,
) )
@@ -104,15 +90,3 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
strat.field = ds_cfg["field"] strat.field = ds_cfg["field"]
return strat return strat
def load_json(tokenizer, cfg):
strat = CompletionJSONPromptTokenizationStrategy(
CompletionPrompter(),
tokenizer,
True,
cfg.sequence_len,
max_length=cfg.sequence_len * 64,
)
return strat

View File

@@ -122,19 +122,6 @@ def normalize_config(cfg):
or (cfg.model_type and "mistral" in cfg.model_type.lower()) or (cfg.model_type and "mistral" in cfg.model_type.lower())
) )
cfg.is_qwen_derived_model = (
(
hasattr(model_config, "model_type")
and model_config.model_type
in [
"qwen",
]
)
or cfg.is_qwen_derived_model
or "qwen" in cfg.base_model.lower()
or (cfg.model_type and "qwen" in cfg.model_type.lower())
)
if isinstance(cfg.learning_rate, str): if isinstance(cfg.learning_rate, str):
cfg.learning_rate = float(cfg.learning_rate) cfg.learning_rate = float(cfg.learning_rate)
@@ -178,11 +165,7 @@ def validate_config(cfg):
"batch_size is not recommended. Please use gradient_accumulation_steps instead.", "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
"To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.", "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
) )
if ( if cfg.eval_batch_size != cfg.micro_batch_size:
cfg.eval_batch_size
and cfg.micro_batch_size
and cfg.eval_batch_size != cfg.micro_batch_size
):
LOG.warning( LOG.warning(
"eval_batch_size != micro_batch_size. This can lead to VRAM instability." "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
) )
@@ -389,14 +372,6 @@ def validate_config(cfg):
if cfg.rope_scaling: if cfg.rope_scaling:
LOG.warning("`rope_scaling` should now be be a key under `model_config`") LOG.warning("`rope_scaling` should now be be a key under `model_config`")
if cfg.warmup_steps and cfg.warmup_ratio:
raise ValueError("warmup_steps and warmup_ratio are mutually exclusive")
if cfg.is_qwen_derived_model and cfg.gradient_checkpointing:
LOG.warning(
"Gradient checkpointing is broken for Qwen models for transformers>=4.35.0, except main branch."
)
# TODO # TODO
# MPT 7b # MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25 # https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -79,14 +79,6 @@ def prepare_dataset(cfg, tokenizer):
train_dataset, eval_dataset = process_datasets_for_packing( train_dataset, eval_dataset = process_datasets_for_packing(
cfg, train_dataset, eval_dataset, tokenizer cfg, train_dataset, eval_dataset, tokenizer
) )
if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
if total_eval_steps == 0:
raise ValueError(
"eval dataset split is too small for sample_packing. You should set `eval_sample_packing: False`. "
)
if cfg.max_steps: if cfg.max_steps:
total_num_steps = min( total_num_steps = min(
calculate_total_num_steps(cfg, train_dataset), cfg.max_steps calculate_total_num_steps(cfg, train_dataset), cfg.max_steps
@@ -242,14 +234,7 @@ def load_tokenized_prepared_datasets(
local_path = Path(config_dataset.path) local_path = Path(config_dataset.path)
if local_path.exists(): if local_path.exists():
if local_path.is_dir(): if local_path.is_dir():
# TODO dirs with arrow or parquet files could be loaded with `load_from_disk` ds = load_from_disk(config_dataset.path)
ds = load_dataset(
config_dataset.path,
name=config_dataset.name,
data_files=config_dataset.data_files,
streaming=False,
split=None,
)
elif local_path.is_file(): elif local_path.is_file():
ds_type = get_ds_type(config_dataset) ds_type = get_ds_type(config_dataset)

View File

@@ -84,18 +84,6 @@ def load_tokenizer(cfg):
if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing: if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
tokenizer.padding_side = "left" tokenizer.padding_side = "left"
# Qwen base only has single token, so we need to set the special tokens
if cfg.is_qwen_derived_model:
token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
for attr_name in token_ids:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, tokenizer.eod_id)
token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
for attr_name in token_names:
if getattr(tokenizer, attr_name) is None:
setattr(tokenizer, attr_name, "<|endoftext|>")
if cfg.special_tokens: if cfg.special_tokens:
for k, val in cfg.special_tokens.items(): for k, val in cfg.special_tokens.items():
tokenizer.add_special_tokens( tokenizer.add_special_tokens(
@@ -300,10 +288,10 @@ def load_model(
# device=cfg.device, # device=cfg.device,
# ) # )
# model.train() # sets to train instead of eval mode # model.train() # sets to train instead of eval mode
elif model_type == "PhiForCausalLM": elif model_type == "MixFormerSequentialForCausalLM":
from axolotl.models.phi import PhiForCausalLM from axolotl.models.phi import MixFormerSequentialForCausalLM
model = PhiForCausalLM.from_pretrained( model = MixFormerSequentialForCausalLM.from_pretrained(
base_model, base_model,
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None, load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None, load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,

View File

@@ -182,7 +182,7 @@ class MultipackBatchSampler(BatchSampler):
# shave off 1% + 1 for dealing with variance in packing from random sampler to sampler # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
return max( return max(
0, 1,
( (
world_size world_size
* math.floor( * math.floor(

View File

@@ -141,7 +141,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
return train_dataset, eval_dataset return train_dataset, eval_dataset
def calculate_total_num_steps(cfg, train_dataset, update=True): def calculate_total_num_steps(cfg, train_dataset):
if not cfg.total_num_tokens: if not cfg.total_num_tokens:
total_num_tokens = np.sum( total_num_tokens = np.sum(
train_dataset.data.column("input_ids") train_dataset.data.column("input_ids")
@@ -150,8 +150,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
.values .values
) )
LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True) LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
if update: cfg.total_num_tokens = total_num_tokens
cfg.total_num_tokens = total_num_tokens
if not cfg.total_supervised_tokens: if not cfg.total_supervised_tokens:
total_supervised_tokens = ( total_supervised_tokens = (
@@ -164,8 +163,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
f"`total_supervised_tokens: {total_supervised_tokens}`", f"`total_supervised_tokens: {total_supervised_tokens}`",
main_process_only=True, main_process_only=True,
) )
if update: cfg.total_supervised_tokens = total_supervised_tokens
cfg.total_supervised_tokens = total_supervised_tokens
if cfg.sample_packing: if cfg.sample_packing:
# we have to drop anything longer then sequence len otherwise # we have to drop anything longer then sequence len otherwise
@@ -234,8 +232,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
sample_packing_eff_est = ( sample_packing_eff_est = (
math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0 math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
) )
if update: cfg.sample_packing_eff_est = sample_packing_eff_est
cfg.sample_packing_eff_est = sample_packing_eff_est
LOG.debug( LOG.debug(
f"sample_packing_eff_est: {cfg.sample_packing_eff_est}", f"sample_packing_eff_est: {cfg.sample_packing_eff_est}",
main_process_only=True, main_process_only=True,

View File

@@ -31,7 +31,7 @@ class TestPhi(unittest.TestCase):
{ {
"base_model": "microsoft/phi-1_5", "base_model": "microsoft/phi-1_5",
"trust_remote_code": True, "trust_remote_code": True,
"model_type": "PhiForCausalLM", "model_type": "MixFormerSequentialForCausalLM",
"tokenizer_type": "AutoTokenizer", "tokenizer_type": "AutoTokenizer",
"sequence_len": 512, "sequence_len": 512,
"sample_packing": False, "sample_packing": False,
@@ -76,7 +76,7 @@ class TestPhi(unittest.TestCase):
{ {
"base_model": "microsoft/phi-1_5", "base_model": "microsoft/phi-1_5",
"trust_remote_code": True, "trust_remote_code": True,
"model_type": "PhiForCausalLM", "model_type": "MixFormerSequentialForCausalLM",
"tokenizer_type": "AutoTokenizer", "tokenizer_type": "AutoTokenizer",
"sequence_len": 512, "sequence_len": 512,
"sample_packing": True, "sample_packing": True,

View File

@@ -649,33 +649,3 @@ class ValidationTest(unittest.TestCase):
) )
validate_config(cfg) validate_config(cfg)
def test_warmup_step_no_conflict(self):
cfg = DictDefault(
{
"warmup_steps": 10,
"warmup_ratio": 0.1,
}
)
with pytest.raises(
ValueError,
match=r".*warmup_steps and warmup_ratio are mutually exclusive*",
):
validate_config(cfg)
cfg = DictDefault(
{
"warmup_steps": 10,
}
)
validate_config(cfg)
cfg = DictDefault(
{
"warmup_ratio": 0.1,
}
)
validate_config(cfg)