From 17330c05a3dc4da4774481bd6fd5c02cdae153c0 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 23 Sep 2024 15:21:00 -0400
Subject: [PATCH] shampoo checkpoint save workaround

---
 src/axolotl/core/trainer_builder.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 160fee101..86605ac2f 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -977,7 +977,11 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
         run_dir = self._get_output_dir(trial=trial)
         output_dir = os.path.join(run_dir, checkpoint_folder)
         os.makedirs(output_dir, exist_ok=True)
-        return super()._save_checkpoint(model, trial, metrics=metrics)
+        try:
+            return super()._save_checkpoint(model, trial, metrics=metrics)
+        except NotImplementedError as exc:
+            LOG.warning(f"Failed to save checkpoint: {exc}")
+            return None
 
 
 class AxolotlMambaTrainer(AxolotlTrainer):