quickstart instructions for starting from runpod (#5)
This commit is contained in:
@@ -225,7 +225,14 @@ def train(
|
||||
)
|
||||
|
||||
logging.info("Starting trainer...")
|
||||
trainer.train(resume_from_checkpoint=cfg.resume_from_checkpoint)
|
||||
resume_from_checkpoint = cfg.resume_from_checkpoint
|
||||
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
||||
possible_checkpoints = [str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")]
|
||||
if len(possible_checkpoints) > 0:
|
||||
sorted_paths = sorted(possible_checkpoints, key=lambda path: int(path.split('-')[-1]))
|
||||
resume_from_checkpoint = sorted_paths[-1]
|
||||
logging.info(f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}")
|
||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
||||
|
||||
Reference in New Issue
Block a user