From b3c04dd9fed718f5d7dfeaeb6c7c97b57863065e Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 23 Jul 2025 09:38:57 -0400
Subject: [PATCH] workaround for fsdp2 optimizer save failures

---
 src/axolotl/core/trainers/mixins/checkpoints.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/core/trainers/mixins/checkpoints.py b/src/axolotl/core/trainers/mixins/checkpoints.py
index 8f994d78c..4042ef9f1 100644
--- a/src/axolotl/core/trainers/mixins/checkpoints.py
+++ b/src/axolotl/core/trainers/mixins/checkpoints.py
@@ -13,9 +13,11 @@ class CheckpointSaveMixin(Trainer):
     def _save_optimizer_and_scheduler(self, output_dir):
         try:
             super()._save_optimizer_and_scheduler(output_dir)
-        except NotImplementedError as exc:
-            LOG.warning(
+        except (NotImplementedError, KeyError) as exc:
+            # TODO: fix fsdp2 optimizer saving
+            LOG.warning_once(
                 f"Trainer does not support saving optimizer and scheduler: {exc}\n"
                 "Optimizer and scheduler states were not saved - resuming from checkpoints "
-                "for this training run will not be possible."
+                "for this training run will not be possible.",
+                main_process_only=True,
             )