Peft deepspeed resume (#1227)

* import deepspeed integration

* monkeypatch peft adapater with deepspeed for resume from checkpoint

* fix patch

* fix patches attempt 2

* make sure to set lora_model_dir

* skip pylint for deepspeed.utils

* pick up upstream fix in transformers

* remove monkeypatch for deepspeed/peft fix

* no need to set the lora_model_dir on resume

* unset load_in_*bit when using quant config

* guard before del

* better handling of load_in* kwargs
This commit is contained in:
Wing Lian
2024-01-31 18:13:29 -05:00
committed by GitHub
parent 25e037fe2d
commit c67fb71583
4 changed files with 35 additions and 26 deletions

View File

@@ -57,6 +57,21 @@ def train(
eval_dataset = dataset_meta.eval_dataset
total_num_steps = dataset_meta.total_num_steps
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
possible_checkpoints = [
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
]
if len(possible_checkpoints) > 0:
sorted_paths = sorted(
possible_checkpoints,
key=lambda path: int(path.split("-")[-1]),
)
cfg.resume_from_checkpoint = sorted_paths[-1]
LOG.info(
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
)
resume_from_checkpoint = cfg.resume_from_checkpoint
# Load the model and tokenizer
msg = "loading model"
if cfg.adapter:
@@ -79,21 +94,6 @@ def train(
safe_serialization = cfg.save_safetensors is True
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
possible_checkpoints = [
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
]
if len(possible_checkpoints) > 0:
sorted_paths = sorted(
possible_checkpoints,
key=lambda path: int(path.split("-")[-1]),
)
cfg.resume_from_checkpoint = sorted_paths[-1]
LOG.info(
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
)
resume_from_checkpoint = cfg.resume_from_checkpoint
if cfg.unfrozen_parameters:
freeze_parameters_except(model, cfg.unfrozen_parameters)