fix checkpoints on multi-GPU (#481)

This commit is contained in:
Wing Lian
2023-08-26 12:00:03 -04:00
committed by GitHub
parent 56c4a94caf
commit 31f3e71764

View File

@@ -131,7 +131,7 @@ class ReLoRACallback(TrainerCallback):
and state.global_step % self.relora_steps != 0
):
if self.quantized:
if self.last_full_model != checkpoint_folder:
if is_main_process() and self.last_full_model != checkpoint_folder:
# ensure the latest full parameter save is in the latest checkpoint
# folder, so that automatic pruning of checkpoints does not remove it
LOG.info(f"moving last full parameter save to {checkpoint_folder}")