workaround for fsdp2 optimizer save failures

This commit is contained in:
Wing Lian
2025-07-23 09:38:57 -04:00
parent 972c719d38
commit b3c04dd9fe

View File

@@ -13,9 +13,11 @@ class CheckpointSaveMixin(Trainer):
     def _save_optimizer_and_scheduler(self, output_dir):
         try:
             super()._save_optimizer_and_scheduler(output_dir)
-        except NotImplementedError as exc:
-            LOG.warning(
+        except (NotImplementedError, KeyError) as exc:
+            # TODO: fix fsdp2 optimizer saving
+            LOG.warning_once(
                 f"Trainer does not support saving optimizer and scheduler: {exc}\n"
                 "Optimizer and scheduler states were not saved - resuming from checkpoints "
-                "for this training run will not be possible."
+                "for this training run will not be possible.",
+                main_process_only=True,
             )