shampoo checkpoint save workaround

This commit is contained in:
Wing Lian
2024-09-23 15:21:00 -04:00
parent 992ea517b7
commit 17330c05a3

View File

@@ -977,7 +977,11 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
         run_dir = self._get_output_dir(trial=trial)
         output_dir = os.path.join(run_dir, checkpoint_folder)
         os.makedirs(output_dir, exist_ok=True)
-        return super()._save_checkpoint(model, trial, metrics=metrics)
+        try:
+            return super()._save_checkpoint(model, trial, metrics=metrics)
+        except NotImplementedError as exc:
+            LOG.warning(f"Failed to save checkpoint: {exc}")
+            return None
 
 
 class AxolotlMambaTrainer(AxolotlTrainer):