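Shampoo checkpoint save workaround: catch NotImplementedError raised by `_save_checkpoint`, log a warning, and return None instead of crashing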
This commit is contained in:
@@ -977,7 +977,11 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
|||||||
run_dir = self._get_output_dir(trial=trial)
|
run_dir = self._get_output_dir(trial=trial)
|
||||||
output_dir = os.path.join(run_dir, checkpoint_folder)
|
output_dir = os.path.join(run_dir, checkpoint_folder)
|
||||||
os.makedirs(output_dir, exist_ok=True)
|
os.makedirs(output_dir, exist_ok=True)
|
||||||
return super()._save_checkpoint(model, trial, metrics=metrics)
|
try:
|
||||||
|
return super()._save_checkpoint(model, trial, metrics=metrics)
|
||||||
|
except NotImplementedError as exc:
|
||||||
|
LOG.warning(f"Failed to save checkpoint: {exc}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
class AxolotlMambaTrainer(AxolotlTrainer):
|
class AxolotlMambaTrainer(AxolotlTrainer):
|
||||||
|
|||||||
Reference in New Issue
Block a user