diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index e2b1ccc2b..793dcc6a0 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -977,7 +977,6 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
                 packed_seq_lens=[seq_len] * batch_size, total_seq_len=total_seq_len
             )
 
-        # Get the loss from the parent implementation
         loss = super().training_step(model, inputs, num_items_in_batch)
 
         return loss
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index a38e4aa6c..afb3e37a0 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -552,7 +552,7 @@ class ModelLoader:
 
             patch_self_attn_lora(self.cfg)
 
-        if self.cfg.sequence_parallel_degree > 1:
+        if self.cfg.sequence_parallel_degree and self.cfg.sequence_parallel_degree > 1:
             from axolotl.monkeypatch.attention.ring_attn import register_ring_attn
 
             # Initialize ring attn for sequence parallelism. This must be done after