diff --git a/examples/llama-3/diffusion/sft-1b.yaml b/examples/llama-3/diffusion/sft-1b.yaml
index a9a84ace4..d34144d92 100644
--- a/examples/llama-3/diffusion/sft-1b.yaml
+++ b/examples/llama-3/diffusion/sft-1b.yaml
@@ -44,7 +44,7 @@ resume_from_checkpoint:

 sdp_attention: true
 logging_steps: 1
-save_strategy: best
+save_strategy: epoch
 eval_strategy: epoch

 special_tokens:
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 7896c6088..cd4b86641 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -631,7 +631,9 @@ class AxolotlTrainer(

             logs["tokens_per_second_per_gpu"] = round(
                 self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
             )
-            logs["total_tokens"] = int(self.state.total_tokens.item())
+
+            if hasattr(self.state, "total_tokens"):
+                logs["total_tokens"] = int(self.state.total_tokens.item())

         del self._stored_metrics[train_eval]