Quickstart instructions for starting from RunPod (#5)

This commit is contained in:
Wing Lian
2023-04-18 19:22:25 -04:00
committed by GitHub
parent 5cb7ea49a6
commit 0a472e1e08
10 changed files with 332 additions and 21 deletions

View File

@@ -225,7 +225,14 @@ def train(
)
logging.info("Starting trainer...")
trainer.train(resume_from_checkpoint=cfg.resume_from_checkpoint)
resume_from_checkpoint = cfg.resume_from_checkpoint
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
possible_checkpoints = [str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")]
if len(possible_checkpoints) > 0:
sorted_paths = sorted(possible_checkpoints, key=lambda path: int(path.split('-')[-1]))
resume_from_checkpoint = sorted_paths[-1]
logging.info(f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}")
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
if cfg.local_rank == 0:
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading