workaround for fsdp2 optimizer save failures
This commit is contained in:
@@ -13,9 +13,11 @@ class CheckpointSaveMixin(Trainer):
|
|||||||
def _save_optimizer_and_scheduler(self, output_dir):
|
def _save_optimizer_and_scheduler(self, output_dir):
|
||||||
try:
|
try:
|
||||||
super()._save_optimizer_and_scheduler(output_dir)
|
super()._save_optimizer_and_scheduler(output_dir)
|
||||||
except NotImplementedError as exc:
|
except (NotImplementedError, KeyError) as exc:
|
||||||
LOG.warning(
|
# TODO: fix fsdp2 optimizer saving
|
||||||
|
LOG.warning_once(
|
||||||
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
|
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
|
||||||
"Optimizer and scheduler states were not saved - resuming from checkpoints "
|
"Optimizer and scheduler states were not saved - resuming from checkpoints "
|
||||||
"for this training run will not be possible."
|
"for this training run will not be possible.",
|
||||||
|
main_process_only=True,
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user