diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index e2b1ccc2b..793dcc6a0 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -977,7 +977,6 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer): packed_seq_lens=[seq_len] * batch_size, total_seq_len=total_seq_len ) - # Get the loss from the parent implementation loss = super().training_step(model, inputs, num_items_in_batch) return loss diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index a38e4aa6c..afb3e37a0 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -552,7 +552,7 @@ class ModelLoader: patch_self_attn_lora(self.cfg) - if self.cfg.sequence_parallel_degree > 1: + if self.cfg.sequence_parallel_degree and self.cfg.sequence_parallel_degree > 1: from axolotl.monkeypatch.attention.ring_attn import register_ring_attn # Initialize ring attn for sequence parallelism. This must be done after