Compare commits

8 Commits: datasets-r ... autogptq-t

| SHA1 |
|---|
| 0026fcc3df |
| b448c77148 |
| c820d04669 |
| 588cd65a64 |
| caa80e891d |
| ac37753aa2 |
| a29560004b |
| 1deb767fe8 |
@@ -163,8 +163,6 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
 ```
 </details>
 
-- Windows: Please use WSL or Docker!
-
 ### Dataset
 
 Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
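As a hedged illustration of what "dataset formats" means in practice here: a local JSONL file can be loaded through the Hugging Face `datasets` library before Axolotl's prompt tokenization runs. The file name and column names below are assumptions, not taken from this diff.

```python
# Hypothetical sketch: load a local JSONL instruction dataset. The path and
# the column names are illustrative assumptions.
from datasets import load_dataset

ds = load_dataset("json", data_files="data/alpaca_sample.jsonl", split="train")
print(ds.column_names)  # e.g. ["instruction", "input", "output"]
```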
@@ -625,11 +623,6 @@ fsdp_config:
 # Deepspeed config path
 deepspeed:
 
-# Advanced DDP Arguments
-ddp_timeout:
-ddp_bucket_cap_mb:
-ddp_broadcast_buffers:
-
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
 
@@ -35,7 +35,10 @@
         "type": "AdamW",
         "params": {
             "lr": "auto",
-            "betas": "auto",
+            "betas": [
+                0.9,
+                0.95
+            ],
             "eps": 1e-8,
             "weight_decay": "auto"
         }
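A hedged aside on the hunk above: the fragment belongs to a DeepSpeed optimizer block, so after this change the Adam betas are pinned to concrete values instead of the `"auto"` placeholder. The sketch below assumes that surrounding structure and an illustrative file path; neither is shown in the diff.

```python
# Hedged sketch: read the optimizer betas from a DeepSpeed JSON config.
# The path and the top-level "optimizer" key are assumptions.
import json

with open("deepspeed/zero1.json") as f:
    ds_cfg = json.load(f)

print(ds_cfg["optimizer"]["params"]["betas"])  # expected: [0.9, 0.95]
```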
@@ -8,7 +8,6 @@ transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
 addict
-evaluate
 fire
 PyYAML>=6.0
 datasets
@@ -4,7 +4,9 @@ import importlib
 import logging
 import os
 import random
+import signal
 import sys
+from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
@@ -15,17 +17,17 @@ import yaml
 
 # add src to the pythonpath so we don't need to pip install this
 from art import text2art
+from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer
 
-from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
 from axolotl.logging_config import configure_logging
-from axolotl.train import TrainDatasetMeta, train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
-from axolotl.utils.models import load_tokenizer
+from axolotl.utils.models import load_model, load_model_config, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
+from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.wandb import setup_wandb_env_vars
 
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -38,13 +40,26 @@ LOG = logging.getLogger("axolotl.scripts")
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 
+@dataclass
+class TrainerCliArgs:
+    """
+    dataclass representing the various non-training arguments
+    """
+
+    debug: bool = field(default=False)
+    inference: bool = field(default=False)
+    merge_lora: bool = field(default=False)
+    prepare_ds_only: bool = field(default=False)
+    prompter: Optional[str] = field(default=None)
+    shard: bool = field(default=False)
+
+
 def print_axolotl_text_art(suffix=None):
     font = "nancyj"
     ascii_text = " axolotl"
     if suffix:
         ascii_text += f" x {suffix}"
     ascii_art = text2art(" axolotl", font=font)
 
     if is_main_process():
         print(ascii_art)
 
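The dataclass added above is consumed through `transformers.HfArgumentParser` later in this diff, which turns each field into a CLI flag. A minimal, hedged sketch of that wiring; the trimmed stand-in dataclass and the argument values are illustrative only.

```python
# Hedged sketch: HfArgumentParser maps dataclass fields to flags such as
# --inference. MiniCliArgs is a trimmed stand-in for TrainerCliArgs above.
from dataclasses import dataclass, field

import transformers


@dataclass
class MiniCliArgs:
    inference: bool = field(default=False)
    prepare_ds_only: bool = field(default=False)


parser = transformers.HfArgumentParser(MiniCliArgs)
cli_args, _ = parser.parse_args_into_dataclasses(
    args=["--inference", "true"], return_remaining_strings=True
)
print(cli_args.inference)  # True
```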
@@ -58,45 +73,9 @@ def get_multi_line_input() -> Optional[str]:
     return instruction
 
 
-def do_merge_lora(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-
-    LOG.info("running merge of LoRA with base model")
-    model = model.merge_and_unload()
-    model.to(dtype=torch.float16)
-
-    if cfg.local_rank == 0:
-        LOG.info("saving merged model")
-        model.save_pretrained(
-            str(Path(cfg.output_dir) / "merged"),
-            safe_serialization=safe_serialization,
-        )
-        tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
-
-
-def shard(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    safe_serialization = cfg.save_safetensors is True
-    LOG.debug("Re-saving model w/ sharding")
-    model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-
-
-def do_inference(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
-    prompter = cli_args.prompter
+def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
+    if prompter == "None":
+        prompter = None
     default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
 
     for token, symbol in default_tokens.items():
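The removed `do_merge_lora` helper wraps the standard PEFT merge flow. A hedged sketch of that flow with the public `peft`/`transformers` APIs follows; the model and adapter paths are assumptions.

```python
# Hedged sketch of the LoRA-merge step performed by the removed helper.
# Paths are illustrative assumptions, not taken from this diff.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("base-model-dir", torch_dtype=torch.float16)
model = PeftModel.from_pretrained(base, "lora-adapter-dir")
merged = model.merge_and_unload()  # folds the LoRA weights into the base model
merged.save_pretrained("merged", safe_serialization=True)
```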
@@ -197,6 +176,141 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b
     return not any(el in list2 for el in list1)
 
 
+def train(
+    *,
+    cfg: DictDefault,
+    cli_args: TrainerCliArgs,
+):
+    # load the tokenizer first
+    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
+    tokenizer = load_tokenizer(cfg)
+
+    if not (
+        cli_args.shard or cli_args.merge_lora or cli_args.inference
+    ):  # don't need to load dataset for these
+        train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
+
+    if cli_args.debug or cfg.debug:
+        LOG.info("check_dataset_labels...")
+        check_dataset_labels(
+            train_dataset.select(
+                [random.randrange(0, len(train_dataset) - 1) for _ in range(5)]  # nosec
+            ),
+            tokenizer,
+        )
+
+    if cli_args.prepare_ds_only:
+        LOG.info("Finished preparing dataset. Exiting...")
+        return
+
+    # Load the model and tokenizer
+    LOG.info("loading model and (optionally) peft_config...")
+    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
+
+    safe_serialization = cfg.save_safetensors is True
+
+    if cli_args.merge_lora and cfg.adapter is not None:
+        LOG.info("running merge of LoRA with base model")
+        model = model.merge_and_unload()
+        model.to(dtype=torch.float16)
+
+        if cfg.local_rank == 0:
+            LOG.info("saving merged model")
+            model.save_pretrained(
+                str(Path(cfg.output_dir) / "merged"),
+                safe_serialization=safe_serialization,
+            )
+            tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
+        return
+
+    if cli_args.inference:
+        LOG.debug("Running inference on model")
+        do_inference(cfg, model, tokenizer, prompter=cli_args.prompter)
+        return
+
+    if cli_args.shard:
+        LOG.debug("Re-saving model w/ sharding")
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+        return
+
+    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
+        possible_checkpoints = [
+            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
+        ]
+        if len(possible_checkpoints) > 0:
+            sorted_paths = sorted(
+                possible_checkpoints,
+                key=lambda path: int(path.split("-")[-1]),
+            )
+            cfg.resume_from_checkpoint = sorted_paths[-1]
+            LOG.info(
+                f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
+            )
+    resume_from_checkpoint = cfg.resume_from_checkpoint
+
+    trainer = setup_trainer(
+        cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
+    )
+
+    model.config.use_cache = False
+
+    if torch.__version__ >= "2" and sys.platform != "win32":
+        LOG.info("Compiling torch model")
+        model = torch.compile(model)
+
+    # go ahead and presave, so we have the adapter config available to inspect
+    if peft_config:
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        peft_config.save_pretrained(cfg.output_dir)
+
+    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
+    if cfg.local_rank == 0:
+
+        def terminate_handler(_, __, model):
+            if cfg.flash_optimum:
+                model = BetterTransformer.reverse(model)
+            model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+            sys.exit(0)
+
+        signal.signal(
+            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
+        )
+
+    LOG.info("Starting trainer...")
+    if cfg.group_by_length:
+        LOG.info("hang tight... sorting dataset for group_by_length")
+
+    if not Path(cfg.output_dir).is_dir():
+        os.makedirs(cfg.output_dir, exist_ok=True)
+    tokenizer.save_pretrained(cfg.output_dir)
+    if cfg.flash_optimum:
+        with torch.backends.cuda.sdp_kernel(
+            enable_flash=True, enable_math=True, enable_mem_efficient=True
+        ):
+            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    else:
+        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
+
+    if cfg.relora_steps:
+        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
+            model = model.merge_and_unload()
+        else:
+            # final model weights have already been saved by `ReLoRACallback.on_train_end`
+            return
+
+    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
+    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
+    if cfg.fsdp:
+        trainer.save_model(cfg.output_dir)
+    elif cfg.local_rank == 0:
+        if cfg.flash_optimum:
+            model = BetterTransformer.reverse(model)
+
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+
+
 def load_cfg(config: Path = Path("examples/"), **kwargs):
     if Path(config).is_dir():
         config = choose_config(config)
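One non-obvious step in the `train()` body added above is the auto-resume logic: checkpoint directories are sorted by the integer suffix of `checkpoint-<step>`, not lexically. A small, hedged sketch of that behaviour in isolation; the directory names are made up.

```python
# Hedged sketch: numeric sort picks checkpoint-1000 over checkpoint-999,
# which a plain string sort would get wrong.
possible_checkpoints = ["out/checkpoint-999", "out/checkpoint-1000", "out/checkpoint-500"]
latest = sorted(possible_checkpoints, key=lambda path: int(path.split("-")[-1]))[-1]
print(latest)  # out/checkpoint-1000
```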
@@ -216,6 +330,15 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):
         else:
             cfg[k] = kwargs[k]
 
+    model_config = load_model_config(cfg)
+
+    # figure out if the model is llama
+    cfg.is_llama_derived_model = (
+        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
+        or cfg.is_llama_derived_model
+        or "llama" in cfg.base_model
+        or (cfg.model_type and "llama" in cfg.model_type.lower())
+    )
     validate_config(cfg)
 
     normalize_config(cfg)
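The llama-detection added to `load_cfg` above leans on the `model_type` field of the model's configuration. A hedged approximation with the public `transformers` API; the model id is an illustrative assumption.

```python
# Hedged approximation of the is_llama_derived_model check above.
from transformers import AutoConfig

model_config = AutoConfig.from_pretrained("huggyllama/llama-7b")
print(getattr(model_config, "model_type", None) == "llama")  # True for LLaMA configs
```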
@@ -224,55 +347,15 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):
     return cfg
 
 
-def load_datasets(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-) -> TrainDatasetMeta:
-    tokenizer = load_tokenizer(cfg)
-
-    train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
-
-    if cli_args.debug or cfg.debug:
-        LOG.info("check_dataset_labels...")
-        check_dataset_labels(
-            train_dataset.select(
-                [
-                    random.randrange(0, len(train_dataset) - 1)  # nosec
-                    for _ in range(cli_args.debug_num_examples)
-                ]
-            ),
-            tokenizer,
-            num_examples=cli_args.debug_num_examples,
-            text_only=cli_args.debug_text_only,
-        )
-
-    return TrainDatasetMeta(
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        total_num_steps=total_num_steps,
-    )
-
-
-def do_cli(config: Path = Path("examples/"), **kwargs):
+def do_train(config: Path = Path("examples/"), **kwargs):
     print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)
     parser = transformers.HfArgumentParser((TrainerCliArgs))
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
         return_remaining_strings=True
     )
-    if parsed_cli_args.inference:
-        do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    elif parsed_cli_args.merge_lora:
-        do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    elif parsed_cli_args.shard:
-        shard(cfg=parsed_cfg, cli_args=parsed_cli_args)
-    else:
-        dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
-        if parsed_cli_args.prepare_ds_only:
-            return
-        train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
+    train(cfg=parsed_cfg, cli_args=parsed_cli_args)
 
 
 if __name__ == "__main__":
-    fire.Fire(do_cli)
+    fire.Fire(do_train)
@@ -1,43 +0,0 @@
-"""
-shared module for cli specific things
-"""
-
-import logging
-from dataclasses import dataclass, field
-from typing import Optional
-
-from axolotl.logging_config import configure_logging
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_model, load_tokenizer
-
-configure_logging()
-LOG = logging.getLogger("axolotl.common.cli")
-
-
-@dataclass
-class TrainerCliArgs:
-    """
-    dataclass representing the various non-training arguments
-    """
-
-    debug: bool = field(default=False)
-    debug_text_only: bool = field(default=False)
-    debug_num_examples: int = field(default=5)
-    inference: bool = field(default=False)
-    merge_lora: bool = field(default=False)
-    prepare_ds_only: bool = field(default=False)
-    prompter: Optional[str] = field(default=None)
-    shard: bool = field(default=False)
-
-
-def load_model_and_tokenizer(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-):
-    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
-    tokenizer = load_tokenizer(cfg)
-    LOG.info("loading model and (optionally) peft_config...")
-    model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
-
-    return model, tokenizer
@@ -1,144 +0,0 @@
-import logging
-from dataclasses import dataclass, field
-from enum import Enum
-from pathlib import Path
-from typing import Any, Dict, Generator, List, Optional, Union
-
-from datasets import Dataset as Dataset_ds
-from datasets import DatasetDict, IterableDataset, load_dataset, load_from_disk
-from huggingface_hub import hf_hub_download
-
-logger = logging.getLogger("axolotl")
-
-
-class DsType(Enum):
-    JSON = "json"
-    ARROW = "arrow"
-    PARQUET = "parquet"
-
-
-@dataclass
-class DatasetConfiguration:
-    path: str
-    type: str
-    name: Optional[str] = field(
-        default=None,
-        metadata={"help": "the name of the dataset configuration to load."},
-    )
-    ds_type: Optional[DsType] = None
-    data_files: Optional[Union[str, List[str]]] = None
-    shards: Optional[int] = None
-    test_size: Optional[float] = None
-
-    @staticmethod
-    def from_dict(d: Dict[str, Any]) -> Generator["DatasetConfiguration", None, None]:
-        if "name" in d and isinstance(d["name"], list):
-            name = d.pop("name")
-            for n in name:
-                yield DatasetConfiguration(
-                    **d,
-                    name=n,
-                )
-
-
-def load_dataset_from_local(config: DatasetConfiguration) -> Optional[Dataset_ds]:
-    local_path = Path(config.path)
-    if not local_path.exists():
-        return None
-    ds = None
-    if local_path.is_dir():
-        if config.ds_type:
-            # TODO dirs with arrow or parquet files could be loaded with `load_from_disk`
-            ds = load_from_disk(config.path)
-        else:
-            ds = load_dataset(
-                config.path,
-                name=config.name,
-                data_files=config.data_files,
-                streaming=False,
-                split=None,
-            )
-    elif local_path.is_file():
-        ds_type = "json"
-        if config.ds_type:
-            ds_type = config.ds_type.value
-        elif "parquet" in config.path:
-            ds_type = "parquet"
-        elif "arrow" in config.path:
-            ds_type = "arrow"
-        ds = load_dataset(
-            ds_type,
-            name=config.name,
-            data_files=config.path,
-            streaming=False,
-            split=None,  # is this correct?
-        )
-    if not ds:
-        raise ValueError(
-            "unhandled dataset load: local path exists, but is neither a directory or a file"
-        )
-    return ds
-
-
-# TODO should this be a DatasetDict?
-class Dataset(Dataset_ds):
-    _config: DatasetConfiguration
-
-    def __init__(self, *args, config: DatasetConfiguration = None, **kwargs):
-        self._config = config
-        super().__init__(*args, **kwargs)
-
-    @staticmethod
-    def from_config(
-        config: DatasetConfiguration,
-        token: bool = False,
-        default_test_size: float = 0.1,
-    ):
-        ds = load_dataset_from_local(config)
-        if not ds:
-            try:
-                ds = load_dataset(
-                    config.path,
-                    name=config.name,
-                    data_files=config.data_files,
-                    token=token,
-                )
-            except FileNotFoundError:
-                pass
-        if not ds:
-            fp = hf_hub_download(
-                repo_id=config.path,
-                repo_type="dataset",
-                filename=config.data_files,
-                token=token,
-            )
-            ds = load_dataset(
-                "json", name=config.name, data_files=fp, streaming=False, split=None
-            )
-        if not ds:
-            raise ValueError("unhandled dataset load")
-        test_size = config.test_size if config.test_size else default_test_size
-        # determine if the dataset is pre-tokenized
-        check_ds = ds["train"] if isinstance(ds, DatasetDict) and "train" in ds else ds
-        is_ds_tokenized = False
-        if "input_ids" in check_ds.features:
-            is_ds_tokenized = True
-            if "attention_mask" not in check_ds.features:
-                logger.warning("`attention_mask` missing from pre-tokenized dataset")
-            if "labels" not in check_ds.features:
-                logger.warning("`labels` missing from pre-tokenized dataset")
-        if test_size and (not isinstance(ds, DatasetDict) or "test" not in ds):
-            ds.train_test_split(test_size=test_size, shuffle=False)
-        pass
-
-
-class DatasetCollection:
-    datasets: List[Dataset] = []
-
-    def __init__(self, datasets: Union[Dataset, List[Dataset]]):
-        self.datasets = datasets if isinstance(datasets, list) else [datasets]
-
-    def __iter__(self):
-        for ds in self.datasets:
-            for d in ds:
-                yield d
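One detail worth noting in the module deleted above: `train_test_split` returns a new `DatasetDict` rather than splitting in place, so the call near the end of `from_config` discards its result. A minimal, hedged sketch of the usual pattern with illustrative data:

```python
# Hedged sketch: train_test_split must be reassigned; it does not mutate ds.
from datasets import Dataset

ds = Dataset.from_dict({"text": [f"example {i}" for i in range(10)]})
splits = ds.train_test_split(test_size=0.2, shuffle=False)
print(len(splits["train"]), len(splits["test"]))  # 8 2
```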
@@ -2,9 +2,7 @@
 
 # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py
 
-import logging
 import warnings
-from functools import partial
 from typing import List, Optional, Tuple, Union
 
 import torch
@@ -35,9 +33,6 @@ except ImportError:
     )
 
 
-LOG = logging.getLogger("axolotl")
-
-
 def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
     transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
         _prepare_decoder_attention_mask
@@ -49,34 +44,6 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
         llama_model_forward
     )
 
-    try:
-        from flash_attn.losses.cross_entropy import CrossEntropyLoss
-
-        LOG.info("patching with flash_attn.losses.cross_entropy")
-        transformers.models.llama.modeling_llama.CrossEntropyLoss = partial(
-            CrossEntropyLoss, inplace_backward=True
-        )
-    except ImportError:
-        LOG.info(
-            "optimized flash-attention CrossEntropyLoss not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=xentropy_cuda_lib&subdirectory=csrc/xentropy'`)"
-        )
-
-    try:
-        from flash_attn.ops.rms_norm import RMSNorm
-
-        class LlamaRMSNorm(RMSNorm):
-            """Patched LLamaRMSNorm"""
-
-            def __init__(self, hidden_size, eps=1e-6):
-                super().__init__(hidden_size, eps=eps)
-
-        LOG.info("patching with flash_attn.ops.rms_norm")
-        transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
-    except ImportError:
-        LOG.info(
-            "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
-        )
-
 
 # Disable the transformation of the attention mask in LlamaModel as the flash attention
 # requires the attention mask to be the same as the key_padding_mask
@@ -309,6 +309,10 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
         )
 
     def build_prompt(self, source) -> Generator[str, None, None]:
+        # ignore the system prompt if provided
+        if source[0]["from"] == "system":
+            source.pop(0)
+
         if len(source) < 2:
             # If there isn't a back and forth conversation, ignore it
             # also happens on the data splitting leaving empty conversations
@@ -317,12 +321,6 @@ class ShareGPTPrompter:  # pylint: disable=too-few-public-methods
             )
 
         conv = self._conversation.copy()
-
-        # Add the conversation system prompt if provided, otherwise use the default one
-        if source[0]["from"] == "system":
-            conv.system = source[0]["value"]
-            source.pop(0)
-
         roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 
         try:
@@ -1,139 +0,0 @@
-"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
-
-import logging
-import os
-import signal
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-import torch
-
-# add src to the pythonpath so we don't need to pip install this
-from datasets import Dataset
-from optimum.bettertransformer import BetterTransformer
-
-from axolotl.common.cli import TrainerCliArgs
-from axolotl.logging_config import configure_logging
-from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_model, load_tokenizer
-from axolotl.utils.trainer import setup_trainer
-
-project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
-src_dir = os.path.join(project_root, "src")
-sys.path.insert(0, src_dir)
-
-configure_logging()
-LOG = logging.getLogger("axolotl.train")
-
-
-@dataclass
-class TrainDatasetMeta:
-    """
-    dataclass to capture the dataset specific options for training
-    """
-
-    train_dataset: Dataset
-    eval_dataset: Optional[Dataset] = None
-    total_num_steps: Optional[int] = None
-
-
-def train(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
-    dataset_meta: TrainDatasetMeta,
-):
-    # load the tokenizer first
-    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
-    tokenizer = load_tokenizer(cfg)
-
-    train_dataset = dataset_meta.train_dataset
-    eval_dataset = dataset_meta.eval_dataset
-    total_num_steps = dataset_meta.total_num_steps
-
-    # Load the model and tokenizer
-    LOG.info("loading model and (optionally) peft_config...")
-    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
-
-    safe_serialization = cfg.save_safetensors is True
-
-    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
-        possible_checkpoints = [
-            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
-        ]
-        if len(possible_checkpoints) > 0:
-            sorted_paths = sorted(
-                possible_checkpoints,
-                key=lambda path: int(path.split("-")[-1]),
-            )
-            cfg.resume_from_checkpoint = sorted_paths[-1]
-            LOG.info(
-                f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
-            )
-    resume_from_checkpoint = cfg.resume_from_checkpoint
-
-    trainer = setup_trainer(
-        cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
-    )
-
-    model.config.use_cache = False
-
-    if torch.__version__ >= "2" and sys.platform != "win32":
-        LOG.info("Compiling torch model")
-        model = torch.compile(model)
-
-    # go ahead and presave, so we have the adapter config available to inspect
-    if peft_config:
-        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
-        peft_config.save_pretrained(cfg.output_dir)
-
-    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
-    if cfg.local_rank == 0:
-
-        def terminate_handler(_, __, model):
-            if cfg.flash_optimum:
-                model = BetterTransformer.reverse(model)
-            model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-            sys.exit(0)
-
-        signal.signal(
-            signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
-        )
-
-    LOG.info("Starting trainer...")
-    if cfg.group_by_length:
-        LOG.info("hang tight... sorting dataset for group_by_length")
-
-    if not Path(cfg.output_dir).is_dir():
-        os.makedirs(cfg.output_dir, exist_ok=True)
-    tokenizer.save_pretrained(cfg.output_dir)
-    if cfg.flash_optimum:
-        with torch.backends.cuda.sdp_kernel(
-            enable_flash=True, enable_math=True, enable_mem_efficient=True
-        ):
-            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
-    else:
-        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
-
-    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
-
-    if cfg.relora_steps:
-        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
-            model = model.merge_and_unload()
-        else:
-            # final model weights have already been saved by `ReLoRACallback.on_train_end`
-            return model, tokenizer
-
-    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
-    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
-    if cfg.fsdp:
-        trainer.save_model(cfg.output_dir)
-    elif cfg.local_rank == 0:
-        if cfg.flash_optimum:
-            model = BetterTransformer.reverse(model)
-
-        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-
-    return model, tokenizer
@@ -1,19 +1,9 @@
 """Callbacks for Trainer class"""
 
-from __future__ import annotations
-
 import logging
 import os
-from typing import TYPE_CHECKING, Dict, List
 
-import evaluate
-import numpy as np
-import pandas as pd
-import torch
-import torch.distributed as dist
-from datasets import load_dataset
 from optimum.bettertransformer import BetterTransformer
-from tqdm import tqdm
 from transformers import (
     TrainerCallback,
     TrainerControl,
@@ -23,20 +13,8 @@ from transformers (
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
 
 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.distributed import (
-    barrier,
-    gather_scalar_from_all_ranks,
-    get_world_size,
-    is_distributed,
-    is_main_process,
-    zero_first,
-)
-
-if TYPE_CHECKING:
-    from axolotl.utils.trainer import AxolotlTrainingArguments
 
 LOG = logging.getLogger("axolotl.callbacks")
-IGNORE_INDEX = -100
 
 
 class SavePeftModelCallback(TrainerCallback):  # pylint: disable=too-few-public-methods
@@ -118,202 +96,3 @@ class GPUStatsCallback(
             log_gpu_memory_usage(LOG, "while training", self.cfg.device)
         self.logged = True
         return control
-
-
-def bench_eval_callback_factory(trainer, tokenizer):
-    accuracy = evaluate.load("accuracy")
-    abcd_idx = [
-        tokenizer("A", add_special_tokens=False).input_ids[0],
-        tokenizer("B", add_special_tokens=False).input_ids[0],
-        tokenizer("C", add_special_tokens=False).input_ids[0],
-        tokenizer("D", add_special_tokens=False).input_ids[0],
-        tokenizer("E", add_special_tokens=False).input_ids[0],
-        tokenizer("F", add_special_tokens=False).input_ids[0],
-        tokenizer("G", add_special_tokens=False).input_ids[0],
-    ]
-    bench_split = "eval"
-
-    def transform_bench_subject(example):
-        # Split on ':' and trim whitespace
-        parts = example["subject"].split(":")
-        first_part = (
-            parts[0].strip().lower().replace("-", "_")
-        )  # Lowercase the first part
-        second_part = (
-            parts[1].strip().replace("-", "_") if len(parts) > 1 else "all"
-        )  # Replace hyphens with underscores
-
-        # Return the transformed values
-        return {"name": first_part, "subject": second_part}
-
-    if trainer.args.bench_dataset == "mmlu-zs":
-        bench_dataset = load_dataset(
-            "openaccess-ai-collective/mmlu-evals",
-            data_files={
-                "eval": "zero_shot_mmlu_val.json",
-                "test": "zero_shot_mmlu_test.json",
-            },
-        )
-        # bench_dataset = bench_dataset.remove_columns("subject")
-    # MMLU Five-shot (Eval/Test only)
-    elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]:
-        bench_dataset = load_dataset(
-            "openaccess-ai-collective/mmlu-evals",
-            data_files={
-                "eval": "five_shot_mmlu_val.json",
-                "test": "five_shot_mmlu_test.json",
-            },
-        )
-        # bench_dataset = bench_dataset.remove_columns('subject')
-    elif "/" in trainer.args.bench_dataset:
-        bench_ds = trainer.args.bench_dataset
-        bench_ds_name = "/".join(bench_ds.split("/", 2)[:2])
-        bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:])
-        bench_dataset = load_dataset(
-            bench_ds_name,
-            data_files={
-                "eval": bench_ds_data_file,
-            },
-        )
-        bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject)
-    else:
-        raise ValueError(
-            f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args"
-        )
-    bench_dataset = bench_dataset[trainer.args.bench_split]
-    if trainer.args.max_bench_samples is not None:
-        bench_dataset = bench_dataset.select(range(trainer.args.max_bench_samples))
-
-    def tokenize_evals(example):
-        source = f"{tokenizer.bos_token}{example['input']}"
-        target = f"{example['output']}{tokenizer.eos_token}"
-
-        tokenized_source = tokenizer(
-            source,
-            max_length=2048,
-            truncation=True,
-            add_special_tokens=False,
-        )
-        tokenized_target = tokenizer(
-            target,
-            max_length=2048,
-            truncation=True,
-            add_special_tokens=False,
-        )
-        input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"]
-        labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[
-            "input_ids"
-        ]
-
-        return {
-            "input_ids": input_ids,
-            "labels": labels,
-            "subject": example["subject"],
-        }
-
-    with zero_first(is_main_process()):
-        bench_dataset = bench_dataset.map(tokenize_evals)
-        bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)
-
-    class BenchEvalCallback(TrainerCallback):
-        """
-        TrainerCallback that runs the MMLU evals
-        """
-
-        def on_evaluate(
-            self,
-            args: AxolotlTrainingArguments,
-            state: TrainerState,  # pylint: disable=unused-argument
-            control: TrainerControl,  # pylint: disable=unused-argument
-            metrics: Dict[str, float],  # pylint: disable=unused-argument
-            **kwargs,  # pylint: disable=unused-argument
-        ):
-            data_loader = trainer.get_bench_dataloader(
-                bench_dataset.remove_columns(["input", "subject", "output", "name"])
-            )
-            trainer.model.eval()
-            preds, refs = [], []
-            loss_bench = 0
-            for batch in tqdm(data_loader, total=len(data_loader)):
-                (loss, logits, labels) = trainer.prediction_step(
-                    trainer.model,
-                    batch,
-                    prediction_loss_only=False,
-                )
-                # There are two tokens, the output, and eos token.
-                for i, logit in enumerate(logits):
-                    label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[
-                        0
-                    ][0]
-                    logit_abcd = logit[label_non_zero_id - 1][abcd_idx]
-                    preds.append(torch.argmax(logit_abcd).item())
-                labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0]
-                refs += [
-                    abcd_idx.index(label) if label in abcd_idx else -1
-                    for label in labels.tolist()
-                ]
-                loss_bench += loss.item()
-            # Extract results by subject.
-            bench_name = bench_dataset["name"]
-            bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
-            for s, p, r in zip(bench_name, preds, refs):  # pylint: disable=invalid-name
-                bench_names[s]["preds"].append(p)
-                bench_names[s]["refs"].append(r)
-            barrier()
-            local_bench_names = bench_names
-            gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
-            # Gather results from all GPUs to GPU 0
-
-            loss_bench_ranks = gather_scalar_from_all_ranks(
-                lambda: loss_bench, get_world_size()
-            )
-            len_data_loader_ranks = gather_scalar_from_all_ranks(
-                lambda: len(data_loader), get_world_size()
-            )
-
-            if is_distributed() and not is_main_process():
-                dist.gather_object(local_bench_names, dst=0)
-            else:
-                if is_distributed():
-                    dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
-                else:
-                    gathered_bench_names = [local_bench_names]
-                bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
-                results = {f"{bench_split}_bench_loss": bench_loss}
-
-                # Combine results from all GPUs
-                combined_bench_names: Dict[str, Dict[str, List]] = {}
-                for bench_name in gathered_bench_names:
-                    for name, data in bench_name.items():
-                        if name not in combined_bench_names:
-                            combined_bench_names[name] = {"refs": [], "preds": []}
-                        combined_bench_names[name]["refs"].extend(data["refs"])
-                        combined_bench_names[name]["preds"].extend(data["preds"])
-
-                bench_scores = []
-                bench_refs = []
-                bench_preds = []
-                for (
-                    bench_name
-                ) in combined_bench_names:  # pylint: disable=consider-using-dict-items
-                    bench_score = accuracy.compute(
-                        references=combined_bench_names[bench_name]["refs"],
-                        predictions=combined_bench_names[bench_name]["preds"],
-                    )["accuracy"]
-                    bench_refs.extend(combined_bench_names[bench_name]["refs"])
-                    bench_preds.extend(combined_bench_names[bench_name]["preds"])
-                    if not pd.isna(bench_score):
-                        results[
-                            f"{bench_split}_bench_accuracy_{bench_name}"
-                        ] = bench_score
-                        bench_scores.append(bench_score)
-                    else:
-                        results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
-                        bench_scores.append(0.0)
-                results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
-                results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
-                    references=bench_refs, predictions=bench_preds
-                )["accuracy"]
-                trainer.log(results)
-
-    return BenchEvalCallback
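The core of the removed benchmark callback scores a multiple-choice answer by slicing the logits one position before the answer token down to the candidate letter token ids and taking the argmax. A self-contained, hedged sketch of that idea; the tensor shapes and token ids are made up, not from a real model.

```python
# Hedged sketch of the logit-slicing trick used by the removed callback.
import torch

abcd_idx = [319, 350, 315, 360]        # assumed token ids for "A".."D"
logits = torch.randn(10, 32000)        # (sequence_len, vocab_size)
answer_pos = 7                         # position of the gold answer token
logit_abcd = logits[answer_pos - 1][abcd_idx]
print(torch.argmax(logit_abcd).item())  # 0..3 -> predicted letter index
```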
@@ -6,7 +6,6 @@ import os
 import torch
 
 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.models import load_model_config
 
 LOG = logging.getLogger("axolotl")
 
@@ -70,16 +69,6 @@ def normalize_config(cfg):
     else:
         cfg.torch_dtype = torch.float32
 
-    model_config = load_model_config(cfg)
-
-    # figure out if the model is llama
-    cfg.is_llama_derived_model = (
-        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
-        or cfg.is_llama_derived_model
-        or "llama" in cfg.base_model
-        or (cfg.model_type and "llama" in cfg.model_type.lower())
-    )
-
     log_gpu_memory_usage(LOG, "baseline", cfg.device)
 
 
@@ -1,10 +1,8 @@
 """
 utility helpers for distributed checks
 """
-import os
 from contextlib import contextmanager
 
-import torch
 import torch.distributed as dist
 from accelerate import Accelerator
 
@@ -45,10 +43,6 @@ def is_main_process():
     return dist.get_rank() == 0
 
 
-def get_world_size():
-    return int(os.getenv("WORLD_SIZE", "1"))
-
-
 @contextmanager
 def zero_first(is_main):
     """
@@ -59,37 +53,3 @@ def zero_first(is_main):
     yield
     if is_main:  # then rank 0 waits after it has run the context
         barrier()
-
-
-def gather_scalar_from_all_ranks(fn, world_size=1):  # pylint: disable=invalid-name
-    """
-    Run a callable 'fn' on all ranks and gather the results on the specified rank.
-
-    Args:
-    - fn (callable): A function that computes the value. This should not have any side effects.
-    - rank (int, optional): The rank that gathers the values. Default is 0.
-    - world_size (int, optional): Total number of processes in the current distributed setup.
-
-    Returns:
-    - A list of computed values from all ranks if on the gathering rank, otherwise None.
-    """
-    value_scalar = fn()
-    if not is_distributed():
-        return [value_scalar]
-    value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
-
-    if not is_main_process():
-        dist.gather(value_tensor, dst=0)
-    else:
-        gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
-        dist.gather(value_tensor, gather_list=gathered_tensors, dst=0)
-
-        # Convert tensors back to their original type (int or float)
-        gathered_values = []
-        for tensor in gathered_tensors:
-            if tensor == tensor.int():
-                gathered_values.append(int(tensor.item()))
-            else:
-                gathered_values.append(float(tensor.item()))
-        return gathered_values
-    return None
@@ -159,13 +159,11 @@ def load_model(
     if cfg.model_revision:
         model_kwargs["revision"] = cfg.model_revision
     if cfg.gptq:
-        model_config = load_model_config(cfg)
-        if hasattr(model_config, "quantization_config"):
-            LOG.warning("model config does not contain quantization_config information")
-        else:
-            model_kwargs["quantization_config"] = GPTQConfig(
-                **model_config.quantization_config
-            )
+        # TODO we should figure out how read the models config.json first
+        model_kwargs["quantization_config"] = GPTQConfig(
+            bits=cfg.gptq_bits,
+            disable_exllama=True,
+        )
     if cfg.adapter == "qlora" and cfg.load_in_4bit:
         model_kwargs["quantization_config"] = BitsAndBytesConfig(
             load_in_4bit=True,
@@ -328,7 +326,7 @@ def load_model(
 
     # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
     # convert them back to fp16/bf16 for flash-attn compatibility.
-    if needs_fa2_dtype or (cfg.flash_attention and cfg.is_llama_derived_model):
+    if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model):
         LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
         for name, module in model.named_modules():
             if "norm" in name:
@@ -8,13 +8,13 @@ from termcolor import colored
 LOG = logging.getLogger("axolotl")
 
 
-def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
+def check_dataset_labels(dataset, tokenizer):
     # the dataset is already shuffled, so let's just check the first 5 elements
-    for idx in range(num_examples):
-        check_example_labels(dataset[idx], tokenizer, text_only=text_only)
+    for idx in range(5):
+        check_example_labels(dataset[idx], tokenizer)
 
 
-def check_example_labels(example, tokenizer, text_only=False):
+def check_example_labels(example, tokenizer):
     # Get the input_ids, labels, and attention_mask from the dataset
     input_ids = example["input_ids"]
     labels = example["labels"]
@@ -29,10 +29,8 @@ def check_example_labels(example, tokenizer, text_only=False):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
-        colored_token = colored(decoded_input_token, color) + (
-            not text_only
-            and colored(f"({label_id}, {mask}, {input_id})", "white")
-            or ""
+        colored_token = colored(decoded_input_token, color) + colored(
+            f"({label_id}, {mask}, {input_id})", "white"
         )
         colored_tokens.append(colored_token)
 
@@ -12,15 +12,9 @@ from typing import Optional, Union
 
 import numpy as np
 import torch.cuda
-import transformers
 from datasets import Dataset, set_caching_enabled
 from torch.optim.lr_scheduler import OneCycleLR
-from torch.utils.data import (
-    DataLoader,
-    DistributedSampler,
-    RandomSampler,
-    SequentialSampler,
-)
+from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
 from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
 from transformers.trainer_pt_utils import SequentialDistributedSampler
 
@@ -29,7 +23,6 @@ from axolotl.utils.callbacks import (
     GPUStatsCallback,
     SaveBetterTransformerModelCallback,
     SavePeftModelCallback,
-    bench_eval_callback_factory,
 )
 from axolotl.utils.collators import DataCollatorForSeq2Seq
 from axolotl.utils.dataloader import MultipackDistributedDataloader
@@ -134,27 +127,6 @@ class AxolotlTrainingArguments(TrainingArguments):
         default=None,
         metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
     )
-    bench_split: Optional[str] = field(
-        default="eval", metadata={"help": "The benchmark split to run on"}
-    )
-    bench_dataset: Optional[str] = field(
-        default="pharaouk/dharma-1/dharma_1_mini.json",
-        metadata={
-            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
-        },
-    )
-    do_bench_eval: Optional[bool] = field(
-        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
-    )
-    max_bench_samples: Optional[int] = field(
-        default=None,
-        metadata={
-            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
-        },
-    )
-    bench_source_max_len: int = field(
-        default=2048, metadata={"help": "Maximum source sequence length for bench."}
-    )
 
 
 class AxolotlTrainer(Trainer):
@@ -164,10 +136,6 @@ class AxolotlTrainer(Trainer):
 
     args = None  # type: AxolotlTrainingArguments
 
-    def __init__(self, *args, bench_data_collator=None, **kwargs):
-        self.bench_data_collator = bench_data_collator
-        super().__init__(*args, **kwargs)
-
     def create_scheduler(
         self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
     ):
@@ -258,31 +226,6 @@ class AxolotlTrainer(Trainer):
             )
         return super().get_eval_dataloader(eval_dataset)
 
-    def _get_bench_sampler(
-        self, bench_dataset: Dataset
-    ) -> Optional[torch.utils.data.Sampler]:
-        if self.args.world_size <= 1:
-            return SequentialSampler(bench_dataset)
-        return None
-
-    def get_bench_dataloader(
-        self,
-        bench_dataset: Dataset,
-    ) -> Union[DataLoader, MultipackDistributedDataloader]:
-        dataloader_params = {
-            "batch_size": self.args.eval_batch_size,
-            "collate_fn": self.bench_data_collator,
-            "num_workers": self.args.dataloader_num_workers,
-            "pin_memory": self.args.dataloader_pin_memory,
-        }
-
-        if not isinstance(bench_dataset, torch.utils.data.IterableDataset):
-            dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset)
-            dataloader_params["drop_last"] = self.args.dataloader_drop_last
-
-        return DataLoader(bench_dataset, **dataloader_params)
-        # return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))
-
     def compute_loss(self, model, inputs, return_outputs=False):
         # use one's weighted cross entropy loss calc
         # if self.args.sample_packing:
@@ -361,7 +304,7 @@ def add_position_ids(sample):
 
 
 def drop_long_seq(sample, sequence_len=2048):
-    return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0
+    return len(sample["input_ids"]) <= sequence_len
 
 
 @contextmanager
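`drop_long_seq` is used as a predicate for `datasets` filtering; a hedged sketch of that wiring with a toy dataset (the rows are illustrative):

```python
# Hedged sketch: apply drop_long_seq as a filter predicate with a bound length.
from functools import partial

from datasets import Dataset


def drop_long_seq(sample, sequence_len=2048):
    return len(sample["input_ids"]) <= sequence_len


ds = Dataset.from_dict({"input_ids": [[1, 2, 3], list(range(5000))]})
ds = ds.filter(partial(drop_long_seq, sequence_len=2048))
print(len(ds))  # 1 -> the over-length row is dropped
```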
@@ -401,16 +344,6 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
         LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
         cfg.total_num_tokens = total_num_tokens
 
-    if not cfg.total_supervised_tokens:
-        total_supervised_tokens = (
-            train_dataset.data.column("labels")
-            .to_pandas()
-            .apply(lambda x: np.sum(np.array(x) != -100))
-            .sum()
-        )
-        LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`")
-        cfg.total_supervised_tokens = total_supervised_tokens
-
     if cfg.sample_packing_eff_est:
         total_num_steps = (
             # match count to len est in dataloader
@@ -568,20 +501,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
             "steps" if cfg.save_steps else "epoch"
         )
 
-    if cfg.do_bench_eval:
-        training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval
-        if cfg.bench_dataset:
-            training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
-
-    # DDP Config
-    if cfg.ddp_timeout:
-        training_arguments_kwargs["ddp_timeout"] = cfg.ddp_timeout
-    # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
-    if cfg.ddp_bucket_cap_mb:
-        training_arguments_kwargs["ddp_bucket_cap_mb"] = cfg.ddp_bucket_cap_mb
-    if cfg.ddp_broadcast_buffers is not None:
-        training_arguments_kwargs["ddp_broadcast_buffers"] = cfg.ddp_broadcast_buffers
-
     training_args = AxolotlTrainingArguments(  # pylint: disable=unexpected-keyword-arg
         max_steps=total_num_steps if cfg.max_steps else -1,
         max_seq_length=cfg.sequence_len,
@@ -694,16 +613,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
             return_tensors="pt",
             **data_collator_kwargs,
         ),
-        bench_data_collator=transformers.DataCollatorForSeq2Seq(
-            tokenizer,
-            return_tensors="pt",
-            **data_collator_kwargs,
-        ),
         callbacks=callbacks,
         **trainer_kwargs,
     )
 
-    if cfg.do_bench_eval:
-        trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer))
-
     return trainer