workaround for fsdp2 optimizer save failures

This commit is contained in:
Wing Lian
2025-07-23 09:38:57 -04:00
parent 972c719d38
commit b3c04dd9fe

View File

@@ -13,9 +13,11 @@ class CheckpointSaveMixin(Trainer):
     def _save_optimizer_and_scheduler(self, output_dir):
         try:
             super()._save_optimizer_and_scheduler(output_dir)
-        except NotImplementedError as exc:
-            LOG.warning(
+        except (NotImplementedError, KeyError) as exc:
+            # TODO: fix fsdp2 optimizer saving
+            LOG.warning_once(
                 f"Trainer does not support saving optimizer and scheduler: {exc}\n"
                 "Optimizer and scheduler states were not saved - resuming from checkpoints "
-                "for this training run will not be possible."
+                "for this training run will not be possible.",
+                main_process_only=True,
             )