From 31f3e717642b4de95263a05c9e8a57a4b1113217 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 26 Aug 2023 12:00:03 -0400 Subject: [PATCH] fix checkpoints on multigpu (#481) --- src/axolotl/monkeypatch/relora.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py index e247fafd2..9dac77e18 100644 --- a/src/axolotl/monkeypatch/relora.py +++ b/src/axolotl/monkeypatch/relora.py @@ -131,7 +131,7 @@ class ReLoRACallback(TrainerCallback): and state.global_step % self.relora_steps != 0 ): if self.quantized: - if self.last_full_model != checkpoint_folder: + if is_main_process() and self.last_full_model != checkpoint_folder: # ensure the latest full parameter save is in the latest checkpoint # folder, so that automatic pruning of checkpoints does not remove it LOG.info(f"moving last full parameter save to {checkpoint_folder}")