Compare commits
11 Commits
fsdp-defau
...
multi-gpu-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83d904a27d | ||
|
|
5e4a760ad8 | ||
|
|
1991946c5a | ||
|
|
f51c9c56c6 | ||
|
|
7710e81f50 | ||
|
|
48434bec54 | ||
|
|
396a7a74fc | ||
|
|
b21e4a20fe | ||
|
|
42f9642792 | ||
|
|
c56b450cf5 | ||
|
|
1e07c162f1 |
@@ -163,6 +163,8 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
||||
```
|
||||
</details>
|
||||
|
||||
- Windows: Please use WSL or Docker!
|
||||
|
||||
### Dataset
|
||||
|
||||
Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
|
||||
@@ -623,6 +625,11 @@ fsdp_config:
|
||||
# Deepspeed config path
|
||||
deepspeed:
|
||||
|
||||
# Advanced DDP Arguments
|
||||
ddp_timeout:
|
||||
ddp_bucket_cap_mb:
|
||||
ddp_broadcast_buffers:
|
||||
|
||||
# Path to torch distx for optim 'adamw_anyprecision'
|
||||
torchdistx_path:
|
||||
|
||||
@@ -665,7 +672,6 @@ fsdp:
|
||||
fsdp_config:
|
||||
fsdp_offload_params: true
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
|
||||
@@ -35,10 +35,7 @@
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": [
|
||||
0.9,
|
||||
0.95
|
||||
],
|
||||
"betas": "auto",
|
||||
"eps": 1e-8,
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
|
||||
@@ -4,9 +4,7 @@ import importlib
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import signal
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
@@ -17,17 +15,17 @@ import yaml
|
||||
|
||||
# add src to the pythonpath so we don't need to pip install this
|
||||
from art import text2art
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
from transformers import GenerationConfig, TextStreamer
|
||||
|
||||
from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
|
||||
from axolotl.logging_config import configure_logging
|
||||
from axolotl.train import TrainDatasetMeta, train
|
||||
from axolotl.utils.config import normalize_config, validate_config
|
||||
from axolotl.utils.data import prepare_dataset
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_main_process
|
||||
from axolotl.utils.models import load_model, load_model_config, load_tokenizer
|
||||
from axolotl.utils.models import load_model_config, load_tokenizer
|
||||
from axolotl.utils.tokenization import check_dataset_labels
|
||||
from axolotl.utils.trainer import setup_trainer
|
||||
from axolotl.utils.wandb import setup_wandb_env_vars
|
||||
|
||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
@@ -40,26 +38,13 @@ LOG = logging.getLogger("axolotl.scripts")
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainerCliArgs:
|
||||
"""
|
||||
dataclass representing the various non-training arguments
|
||||
"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
inference: bool = field(default=False)
|
||||
merge_lora: bool = field(default=False)
|
||||
prepare_ds_only: bool = field(default=False)
|
||||
prompter: Optional[str] = field(default=None)
|
||||
shard: bool = field(default=False)
|
||||
|
||||
|
||||
def print_axolotl_text_art(suffix=None):
|
||||
font = "nancyj"
|
||||
ascii_text = " axolotl"
|
||||
if suffix:
|
||||
ascii_text += f" x {suffix}"
|
||||
ascii_art = text2art(" axolotl", font=font)
|
||||
|
||||
if is_main_process():
|
||||
print(ascii_art)
|
||||
|
||||
@@ -73,9 +58,45 @@ def get_multi_line_input() -> Optional[str]:
|
||||
return instruction
|
||||
|
||||
|
||||
def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
|
||||
if prompter == "None":
|
||||
prompter = None
|
||||
def do_merge_lora(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
):
|
||||
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
||||
safe_serialization = cfg.save_safetensors is True
|
||||
|
||||
LOG.info("running merge of LoRA with base model")
|
||||
model = model.merge_and_unload()
|
||||
model.to(dtype=torch.float16)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
LOG.info("saving merged model")
|
||||
model.save_pretrained(
|
||||
str(Path(cfg.output_dir) / "merged"),
|
||||
safe_serialization=safe_serialization,
|
||||
)
|
||||
tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
|
||||
|
||||
|
||||
def shard(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
):
|
||||
model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
||||
safe_serialization = cfg.save_safetensors is True
|
||||
LOG.debug("Re-saving model w/ sharding")
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
|
||||
|
||||
def do_inference(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
):
|
||||
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
||||
prompter = cli_args.prompter
|
||||
default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
|
||||
|
||||
for token, symbol in default_tokens.items():
|
||||
@@ -176,141 +197,6 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b
|
||||
return not any(el in list2 for el in list1)
|
||||
|
||||
|
||||
def train(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
):
|
||||
# load the tokenizer first
|
||||
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
|
||||
if not (
|
||||
cli_args.shard or cli_args.merge_lora or cli_args.inference
|
||||
): # don't need to load dataset for these
|
||||
train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
|
||||
|
||||
if cli_args.debug or cfg.debug:
|
||||
LOG.info("check_dataset_labels...")
|
||||
check_dataset_labels(
|
||||
train_dataset.select(
|
||||
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
|
||||
),
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
if cli_args.prepare_ds_only:
|
||||
LOG.info("Finished preparing dataset. Exiting...")
|
||||
return
|
||||
|
||||
# Load the model and tokenizer
|
||||
LOG.info("loading model and (optionally) peft_config...")
|
||||
model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
|
||||
|
||||
safe_serialization = cfg.save_safetensors is True
|
||||
|
||||
if cli_args.merge_lora and cfg.adapter is not None:
|
||||
LOG.info("running merge of LoRA with base model")
|
||||
model = model.merge_and_unload()
|
||||
model.to(dtype=torch.float16)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
LOG.info("saving merged model")
|
||||
model.save_pretrained(
|
||||
str(Path(cfg.output_dir) / "merged"),
|
||||
safe_serialization=safe_serialization,
|
||||
)
|
||||
tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
|
||||
return
|
||||
|
||||
if cli_args.inference:
|
||||
LOG.debug("Running inference on model")
|
||||
do_inference(cfg, model, tokenizer, prompter=cli_args.prompter)
|
||||
return
|
||||
|
||||
if cli_args.shard:
|
||||
LOG.debug("Re-saving model w/ sharding")
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
return
|
||||
|
||||
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
||||
possible_checkpoints = [
|
||||
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
||||
]
|
||||
if len(possible_checkpoints) > 0:
|
||||
sorted_paths = sorted(
|
||||
possible_checkpoints,
|
||||
key=lambda path: int(path.split("-")[-1]),
|
||||
)
|
||||
cfg.resume_from_checkpoint = sorted_paths[-1]
|
||||
LOG.info(
|
||||
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
|
||||
)
|
||||
resume_from_checkpoint = cfg.resume_from_checkpoint
|
||||
|
||||
trainer = setup_trainer(
|
||||
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
||||
)
|
||||
|
||||
model.config.use_cache = False
|
||||
|
||||
if torch.__version__ >= "2" and sys.platform != "win32":
|
||||
LOG.info("Compiling torch model")
|
||||
model = torch.compile(model)
|
||||
|
||||
# go ahead and presave, so we have the adapter config available to inspect
|
||||
if peft_config:
|
||||
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
||||
peft_config.save_pretrained(cfg.output_dir)
|
||||
|
||||
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
||||
if cfg.local_rank == 0:
|
||||
|
||||
def terminate_handler(_, __, model):
|
||||
if cfg.flash_optimum:
|
||||
model = BetterTransformer.reverse(model)
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(
|
||||
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
|
||||
)
|
||||
|
||||
LOG.info("Starting trainer...")
|
||||
if cfg.group_by_length:
|
||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||
|
||||
if not Path(cfg.output_dir).is_dir():
|
||||
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||
tokenizer.save_pretrained(cfg.output_dir)
|
||||
if cfg.flash_optimum:
|
||||
with torch.backends.cuda.sdp_kernel(
|
||||
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
||||
):
|
||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||
else:
|
||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||
|
||||
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
||||
|
||||
if cfg.relora_steps:
|
||||
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
||||
model = model.merge_and_unload()
|
||||
else:
|
||||
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
||||
return
|
||||
|
||||
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
||||
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
||||
if cfg.fsdp:
|
||||
trainer.save_model(cfg.output_dir)
|
||||
elif cfg.local_rank == 0:
|
||||
if cfg.flash_optimum:
|
||||
model = BetterTransformer.reverse(model)
|
||||
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
|
||||
|
||||
def load_cfg(config: Path = Path("examples/"), **kwargs):
|
||||
if Path(config).is_dir():
|
||||
config = choose_config(config)
|
||||
@@ -347,15 +233,55 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):
|
||||
return cfg
|
||||
|
||||
|
||||
def do_train(config: Path = Path("examples/"), **kwargs):
|
||||
def load_datasets(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
) -> TrainDatasetMeta:
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
|
||||
train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
|
||||
|
||||
if cli_args.debug or cfg.debug:
|
||||
LOG.info("check_dataset_labels...")
|
||||
check_dataset_labels(
|
||||
train_dataset.select(
|
||||
[
|
||||
random.randrange(0, len(train_dataset) - 1) # nosec
|
||||
for _ in range(cli_args.debug_num_examples)
|
||||
]
|
||||
),
|
||||
tokenizer,
|
||||
num_examples=cli_args.debug_num_examples,
|
||||
text_only=cli_args.debug_text_only,
|
||||
)
|
||||
|
||||
return TrainDatasetMeta(
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
total_num_steps=total_num_steps,
|
||||
)
|
||||
|
||||
|
||||
def do_cli(config: Path = Path("examples/"), **kwargs):
|
||||
print_axolotl_text_art()
|
||||
parsed_cfg = load_cfg(config, **kwargs)
|
||||
parser = transformers.HfArgumentParser((TrainerCliArgs))
|
||||
parsed_cli_args, _ = parser.parse_args_into_dataclasses(
|
||||
return_remaining_strings=True
|
||||
)
|
||||
train(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
if parsed_cli_args.inference:
|
||||
do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
elif parsed_cli_args.merge_lora:
|
||||
do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
elif parsed_cli_args.shard:
|
||||
shard(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
else:
|
||||
dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
if parsed_cli_args.prepare_ds_only:
|
||||
return
|
||||
train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(do_train)
|
||||
fire.Fire(do_cli)
|
||||
|
||||
0
src/axolotl/common/__init__.py
Normal file
0
src/axolotl/common/__init__.py
Normal file
43
src/axolotl/common/cli.py
Normal file
43
src/axolotl/common/cli.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
shared module for cli specific things
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from axolotl.logging_config import configure_logging
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model, load_tokenizer
|
||||
|
||||
configure_logging()
|
||||
LOG = logging.getLogger("axolotl.common.cli")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainerCliArgs:
|
||||
"""
|
||||
dataclass representing the various non-training arguments
|
||||
"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
debug_text_only: bool = field(default=False)
|
||||
debug_num_examples: int = field(default=5)
|
||||
inference: bool = field(default=False)
|
||||
merge_lora: bool = field(default=False)
|
||||
prepare_ds_only: bool = field(default=False)
|
||||
prompter: Optional[str] = field(default=None)
|
||||
shard: bool = field(default=False)
|
||||
|
||||
|
||||
def load_model_and_tokenizer(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
):
|
||||
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
LOG.info("loading model and (optionally) peft_config...")
|
||||
model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
|
||||
|
||||
return model, tokenizer
|
||||
@@ -1,45 +0,0 @@
|
||||
"""
|
||||
Monkeypatch to fix fsdp set state when no previous state was set
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
from typing import Generator, Optional
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.distributed.fsdp.api import (
|
||||
OptimStateDictConfig,
|
||||
StateDictConfig,
|
||||
StateDictType,
|
||||
)
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
|
||||
|
||||
|
||||
@staticmethod
|
||||
@contextlib.contextmanager
|
||||
def state_dict_type_patch(
|
||||
module: nn.Module,
|
||||
state_dict_type: StateDictType,
|
||||
state_dict_config: Optional[StateDictConfig] = None,
|
||||
optim_state_dict_config: Optional[OptimStateDictConfig] = None,
|
||||
) -> Generator:
|
||||
prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(
|
||||
module,
|
||||
state_dict_type,
|
||||
state_dict_config,
|
||||
optim_state_dict_config,
|
||||
)
|
||||
yield
|
||||
if prev_state_dict_settings.state_dict_type:
|
||||
FullyShardedDataParallel.set_state_dict_type(
|
||||
module,
|
||||
prev_state_dict_settings.state_dict_type,
|
||||
prev_state_dict_settings.state_dict_config,
|
||||
prev_state_dict_settings.optim_state_dict_config,
|
||||
)
|
||||
|
||||
|
||||
def replace_fsdp_state_dict_type():
|
||||
torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.state_dict_type = (
|
||||
state_dict_type_patch
|
||||
)
|
||||
139
src/axolotl/train.py
Normal file
139
src/axolotl/train.py
Normal file
@@ -0,0 +1,139 @@
|
||||
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
# add src to the pythonpath so we don't need to pip install this
|
||||
from datasets import Dataset
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.logging_config import configure_logging
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model, load_tokenizer
|
||||
from axolotl.utils.trainer import setup_trainer
|
||||
|
||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||
src_dir = os.path.join(project_root, "src")
|
||||
sys.path.insert(0, src_dir)
|
||||
|
||||
configure_logging()
|
||||
LOG = logging.getLogger("axolotl.train")
|
||||
|
||||
|
||||
@dataclass
|
||||
class TrainDatasetMeta:
|
||||
"""
|
||||
dataclass to capture the dataset specific options for training
|
||||
"""
|
||||
|
||||
train_dataset: Dataset
|
||||
eval_dataset: Optional[Dataset] = None
|
||||
total_num_steps: Optional[int] = None
|
||||
|
||||
|
||||
def train(
|
||||
*,
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs,
|
||||
dataset_meta: TrainDatasetMeta,
|
||||
):
|
||||
# load the tokenizer first
|
||||
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
|
||||
train_dataset = dataset_meta.train_dataset
|
||||
eval_dataset = dataset_meta.eval_dataset
|
||||
total_num_steps = dataset_meta.total_num_steps
|
||||
|
||||
# Load the model and tokenizer
|
||||
LOG.info("loading model and (optionally) peft_config...")
|
||||
model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
|
||||
|
||||
safe_serialization = cfg.save_safetensors is True
|
||||
|
||||
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
||||
possible_checkpoints = [
|
||||
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
||||
]
|
||||
if len(possible_checkpoints) > 0:
|
||||
sorted_paths = sorted(
|
||||
possible_checkpoints,
|
||||
key=lambda path: int(path.split("-")[-1]),
|
||||
)
|
||||
cfg.resume_from_checkpoint = sorted_paths[-1]
|
||||
LOG.info(
|
||||
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
|
||||
)
|
||||
resume_from_checkpoint = cfg.resume_from_checkpoint
|
||||
|
||||
trainer = setup_trainer(
|
||||
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
||||
)
|
||||
|
||||
model.config.use_cache = False
|
||||
|
||||
if torch.__version__ >= "2" and sys.platform != "win32":
|
||||
LOG.info("Compiling torch model")
|
||||
model = torch.compile(model)
|
||||
|
||||
# go ahead and presave, so we have the adapter config available to inspect
|
||||
if peft_config:
|
||||
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
||||
peft_config.save_pretrained(cfg.output_dir)
|
||||
|
||||
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
||||
if cfg.local_rank == 0:
|
||||
|
||||
def terminate_handler(_, __, model):
|
||||
if cfg.flash_optimum:
|
||||
model = BetterTransformer.reverse(model)
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
sys.exit(0)
|
||||
|
||||
signal.signal(
|
||||
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
|
||||
)
|
||||
|
||||
LOG.info("Starting trainer...")
|
||||
if cfg.group_by_length:
|
||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||
|
||||
if not Path(cfg.output_dir).is_dir():
|
||||
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||
tokenizer.save_pretrained(cfg.output_dir)
|
||||
if cfg.flash_optimum:
|
||||
with torch.backends.cuda.sdp_kernel(
|
||||
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
||||
):
|
||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||
else:
|
||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||
|
||||
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
||||
|
||||
if cfg.relora_steps:
|
||||
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
||||
model = model.merge_and_unload()
|
||||
else:
|
||||
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
||||
return model, tokenizer
|
||||
|
||||
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
||||
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
||||
if cfg.fsdp:
|
||||
trainer.save_model(cfg.output_dir)
|
||||
elif cfg.local_rank == 0:
|
||||
if cfg.flash_optimum:
|
||||
model = BetterTransformer.reverse(model)
|
||||
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
|
||||
return model, tokenizer
|
||||
@@ -11,6 +11,7 @@ import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from accelerate.state import PartialState
|
||||
from datasets import load_dataset
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
from tqdm import tqdm
|
||||
@@ -24,11 +25,9 @@ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
||||
|
||||
from axolotl.utils.bench import log_gpu_memory_usage
|
||||
from axolotl.utils.distributed import (
|
||||
barrier,
|
||||
gather_scalar_from_all_ranks,
|
||||
get_world_size,
|
||||
is_main_process,
|
||||
zero_first,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@@ -36,6 +35,7 @@ if TYPE_CHECKING:
|
||||
|
||||
LOG = logging.getLogger("axolotl.callbacks")
|
||||
IGNORE_INDEX = -100
|
||||
dist_state = PartialState()
|
||||
|
||||
|
||||
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
||||
@@ -210,7 +210,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
||||
"subject": example["subject"],
|
||||
}
|
||||
|
||||
with zero_first(is_main_process()):
|
||||
with dist_state.main_process_first():
|
||||
bench_dataset = bench_dataset.map(tokenize_evals)
|
||||
bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)
|
||||
|
||||
@@ -258,7 +258,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
||||
for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
|
||||
bench_names[s]["preds"].append(p)
|
||||
bench_names[s]["refs"].append(r)
|
||||
barrier()
|
||||
dist_state.wait_for_everyone()
|
||||
local_bench_names = bench_names
|
||||
gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
|
||||
# Gather results from all GPUs to GPU 0
|
||||
@@ -275,7 +275,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
||||
else:
|
||||
dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
|
||||
bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
|
||||
results = {"bench_loss": bench_loss}
|
||||
results = {f"{bench_split}_bench_loss": bench_loss}
|
||||
|
||||
# Combine results from all GPUs
|
||||
combined_bench_names: Dict[str, Dict[str, List]] = {}
|
||||
@@ -287,6 +287,8 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
||||
combined_bench_names[name]["preds"].extend(data["preds"])
|
||||
|
||||
bench_scores = []
|
||||
bench_refs = []
|
||||
bench_preds = []
|
||||
for (
|
||||
bench_name
|
||||
) in combined_bench_names: # pylint: disable=consider-using-dict-items
|
||||
@@ -294,15 +296,20 @@ def bench_eval_callback_factory(trainer, tokenizer):
|
||||
references=combined_bench_names[bench_name]["refs"],
|
||||
predictions=combined_bench_names[bench_name]["preds"],
|
||||
)["accuracy"]
|
||||
bench_refs.extend(combined_bench_names[bench_name]["refs"])
|
||||
bench_preds.extend(combined_bench_names[bench_name]["preds"])
|
||||
if not pd.isna(bench_score):
|
||||
results[
|
||||
f"bench_{bench_split}_accuracy_{bench_name}"
|
||||
f"{bench_split}_bench_accuracy_{bench_name}"
|
||||
] = bench_score
|
||||
bench_scores.append(bench_score)
|
||||
else:
|
||||
results[f"bench_{bench_split}_accuracy_{bench_name}"] = 0.0
|
||||
results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
|
||||
bench_scores.append(0.0)
|
||||
results[f"bench_{bench_split}_accuracy"] = np.mean(bench_scores)
|
||||
results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
|
||||
results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
|
||||
references=bench_refs, predictions=bench_preds
|
||||
)["accuracy"]
|
||||
trainer.log(results)
|
||||
|
||||
return BenchEvalCallback
|
||||
|
||||
@@ -152,16 +152,6 @@ def validate_config(cfg):
|
||||
if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
|
||||
raise ValueError("FSDP is not supported for falcon models")
|
||||
|
||||
if (
|
||||
cfg.fsdp
|
||||
and cfg.fsdp_config
|
||||
and cfg.fsdp_config.fsdp_state_dict_type
|
||||
and not cfg.fsdp_config.fsdp_sync_module_states
|
||||
):
|
||||
LOG.warning(
|
||||
"We recommend setting fsdp_config.fsdp_sync_module_states to `true`"
|
||||
)
|
||||
|
||||
if (
|
||||
cfg.base_model and "mpt" in cfg.base_model.lower()
|
||||
) and cfg.gradient_checkpointing:
|
||||
|
||||
@@ -7,6 +7,7 @@ from pathlib import Path
|
||||
from typing import Tuple, Union
|
||||
|
||||
import torch
|
||||
from accelerate.state import PartialState
|
||||
from datasets import (
|
||||
Dataset,
|
||||
DatasetDict,
|
||||
@@ -42,7 +43,6 @@ from axolotl.prompters import (
|
||||
SummarizeTLDRPrompter,
|
||||
)
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_main_process, zero_first
|
||||
from axolotl.utils.trainer import (
|
||||
calculate_total_num_steps,
|
||||
process_datasets_for_packing,
|
||||
@@ -50,11 +50,12 @@ from axolotl.utils.trainer import (
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
||||
state = PartialState()
|
||||
|
||||
|
||||
def prepare_dataset(cfg, tokenizer):
|
||||
if not cfg.pretraining_dataset:
|
||||
with zero_first(is_main_process()):
|
||||
with state.main_process_first():
|
||||
train_dataset, eval_dataset = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||
)
|
||||
@@ -69,7 +70,7 @@ def prepare_dataset(cfg, tokenizer):
|
||||
train_dataset = train_dataset.with_format("torch")
|
||||
eval_dataset = None
|
||||
|
||||
with zero_first(is_main_process()):
|
||||
with state.main_process_first():
|
||||
train_dataset, eval_dataset = process_datasets_for_packing(
|
||||
cfg, train_dataset, eval_dataset
|
||||
)
|
||||
@@ -507,7 +508,7 @@ def load_prepare_datasets(
|
||||
to_hash_test.encode(), usedforsecurity=False
|
||||
).hexdigest()
|
||||
|
||||
with zero_first(is_main_process()):
|
||||
with state.main_process_first():
|
||||
dataset = dataset.train_test_split(
|
||||
test_size=cfg.val_set_size,
|
||||
shuffle=False,
|
||||
|
||||
@@ -1,29 +1,27 @@
|
||||
"""
|
||||
utility helpers for distributed checks
|
||||
"""
|
||||
import os
|
||||
from contextlib import contextmanager
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from accelerate import Accelerator
|
||||
from accelerate import DistributedType
|
||||
from accelerate.state import PartialState
|
||||
from accelerate.utils import wait_for_everyone
|
||||
|
||||
accelerate = None # pylint: disable=invalid-name
|
||||
|
||||
|
||||
def load_accelerate():
|
||||
global accelerate # pylint: disable=global-statement
|
||||
accelerate = Accelerator()
|
||||
state = PartialState()
|
||||
|
||||
|
||||
def is_distributed():
|
||||
"""
|
||||
Check if distributed training is initialized.
|
||||
"""
|
||||
global accelerate # pylint: disable=global-statement
|
||||
if not accelerate:
|
||||
accelerate = Accelerator()
|
||||
return dist.is_available() and dist.is_initialized()
|
||||
return state.distributed_type in (
|
||||
DistributedType.MULTI_GPU,
|
||||
DistributedType.MULTI_CPU,
|
||||
DistributedType.DEEPSPEED,
|
||||
DistributedType.FSDP,
|
||||
)
|
||||
|
||||
|
||||
def barrier():
|
||||
@@ -31,34 +29,19 @@ def barrier():
|
||||
Acts as a barrier to wait for all processes. This ensures that all processes
|
||||
reach the barrier before proceeding further.
|
||||
"""
|
||||
if is_distributed():
|
||||
dist.barrier()
|
||||
wait_for_everyone()
|
||||
|
||||
|
||||
def is_main_process():
|
||||
def is_main_process() -> bool:
|
||||
"""
|
||||
Check if the current process is the main process.
|
||||
If not in distributed mode, always return True.
|
||||
"""
|
||||
if not is_distributed():
|
||||
return True
|
||||
return dist.get_rank() == 0
|
||||
return state.is_main_process
|
||||
|
||||
|
||||
def get_world_size():
|
||||
return int(os.getenv("WORLD_SIZE", "1"))
|
||||
|
||||
|
||||
@contextmanager
|
||||
def zero_first(is_main):
|
||||
"""
|
||||
runs the wrapped context so that rank 0 runs first before other ranks
|
||||
"""
|
||||
if not is_main: # other ranks wait first
|
||||
barrier()
|
||||
yield
|
||||
if is_main: # then rank 0 waits after it has run the context
|
||||
barrier()
|
||||
def get_world_size() -> int:
|
||||
return state.num_processes
|
||||
|
||||
|
||||
def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
|
||||
@@ -76,7 +59,7 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n
|
||||
value_scalar = fn()
|
||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
||||
|
||||
if not is_main_process():
|
||||
if not state.is_main_process:
|
||||
dist.gather(value_tensor, dst=0)
|
||||
else:
|
||||
gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
|
||||
|
||||
@@ -371,7 +371,7 @@ def load_model(
|
||||
|
||||
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
|
||||
# convert them back to fp16/bf16 for flash-attn compatibility.
|
||||
if needs_fa2_dtype and (cfg.flash_attention and cfg.is_llama_derived_model):
|
||||
if needs_fa2_dtype or (cfg.flash_attention and cfg.is_llama_derived_model):
|
||||
LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
|
||||
for name, module in model.named_modules():
|
||||
if "norm" in name:
|
||||
|
||||
@@ -8,13 +8,13 @@ from termcolor import colored
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def check_dataset_labels(dataset, tokenizer):
|
||||
def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
|
||||
# the dataset is already shuffled, so let's just check the first 5 elements
|
||||
for idx in range(5):
|
||||
check_example_labels(dataset[idx], tokenizer)
|
||||
for idx in range(num_examples):
|
||||
check_example_labels(dataset[idx], tokenizer, text_only=text_only)
|
||||
|
||||
|
||||
def check_example_labels(example, tokenizer):
|
||||
def check_example_labels(example, tokenizer, text_only=False):
|
||||
# Get the input_ids, labels, and attention_mask from the dataset
|
||||
input_ids = example["input_ids"]
|
||||
labels = example["labels"]
|
||||
@@ -29,8 +29,10 @@ def check_example_labels(example, tokenizer):
|
||||
decoded_input_token = tokenizer.decode(input_id)
|
||||
# Choose the color based on whether the label has the ignore value or not
|
||||
color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
|
||||
colored_token = colored(decoded_input_token, color) + colored(
|
||||
f"({label_id}, {mask}, {input_id})", "white"
|
||||
colored_token = colored(decoded_input_token, color) + (
|
||||
not text_only
|
||||
and colored(f"({label_id}, {mask}, {input_id})", "white")
|
||||
or ""
|
||||
)
|
||||
colored_tokens.append(colored_token)
|
||||
|
||||
|
||||
@@ -361,7 +361,7 @@ def add_position_ids(sample):
|
||||
|
||||
|
||||
def drop_long_seq(sample, sequence_len=2048):
|
||||
return len(sample["input_ids"]) <= sequence_len
|
||||
return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0
|
||||
|
||||
|
||||
@contextmanager
|
||||
@@ -401,6 +401,16 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
|
||||
LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
|
||||
cfg.total_num_tokens = total_num_tokens
|
||||
|
||||
if not cfg.total_supervised_tokens:
|
||||
total_supervised_tokens = (
|
||||
train_dataset.data.column("labels")
|
||||
.to_pandas()
|
||||
.apply(lambda x: np.sum(np.array(x) != -100))
|
||||
.sum()
|
||||
)
|
||||
LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`")
|
||||
cfg.total_supervised_tokens = total_supervised_tokens
|
||||
|
||||
if cfg.sample_packing_eff_est:
|
||||
total_num_steps = (
|
||||
# match count to len est in dataloader
|
||||
@@ -471,9 +481,6 @@ def setup_fsdp_envs(cfg):
|
||||
os.environ[
|
||||
"FSDP_TRANSFORMER_CLS_TO_WRAP"
|
||||
] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap
|
||||
from axolotl.monkeypatch.fsdp import replace_fsdp_state_dict_type
|
||||
|
||||
replace_fsdp_state_dict_type()
|
||||
|
||||
|
||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||
@@ -582,6 +589,15 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
||||
if cfg.bench_dataset:
|
||||
training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
|
||||
|
||||
# DDP Config
|
||||
if cfg.ddp_timeout:
|
||||
training_arguments_kwargs["ddp_timeout"] = cfg.ddp_timeout
|
||||
# see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
|
||||
if cfg.ddp_bucket_cap_mb:
|
||||
training_arguments_kwargs["ddp_bucket_cap_mb"] = cfg.ddp_bucket_cap_mb
|
||||
if cfg.ddp_broadcast_buffers is not None:
|
||||
training_arguments_kwargs["ddp_broadcast_buffers"] = cfg.ddp_broadcast_buffers
|
||||
|
||||
training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
|
||||
max_steps=total_num_steps if cfg.max_steps else -1,
|
||||
max_seq_length=cfg.sequence_len,
|
||||
|
||||
Reference in New Issue
Block a user