diff --git a/src/axolotl/core/trainers/mixins/checkpoints.py b/src/axolotl/core/trainers/mixins/checkpoints.py
index 8f994d78c..4042ef9f1 100644
--- a/src/axolotl/core/trainers/mixins/checkpoints.py
+++ b/src/axolotl/core/trainers/mixins/checkpoints.py
@@ -13,9 +13,11 @@ class CheckpointSaveMixin(Trainer):
     def _save_optimizer_and_scheduler(self, output_dir):
         try:
             super()._save_optimizer_and_scheduler(output_dir)
-        except NotImplementedError as exc:
-            LOG.warning(
+        except (NotImplementedError, KeyError) as exc:
+            # TODO: fix fsdp2 optimizer saving
+            LOG.warning_once(
                 f"Trainer does not support saving optimizer and scheduler: {exc}\n"
                 "Optimizer and scheduler states were not saved - resuming from checkpoints "
-                "for this training run will not be possible."
+                "for this training run will not be possible.",
+                main_process_only=True,
             )