updates

2025-03-10 21:18:04 +00:00
parent b44a207248
commit 4190ad0647
7 changed files with 187 additions and 432 deletions
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -32,6 +32,9 @@ tokenizer_legacy:
 resize_token_embeddings_to_32x:
 # Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
 shrink_embeddings:
+# Whether to load the model with randomly initialized weights. Useful for
+# pre-training a model from scratch or debugging purposes.
+random_init:

 # (Internal use only)
 # Used to identify which the model is based on
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -871,10 +871,13 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        collator: Type[
            Union[
                V2BatchSamplerDataCollatorForSeq2Seq,
+                V2SequenceParallelPackedDataCollator,
+                SequenceParallelPackedDataCollator,
                BatchSamplerDataCollatorForSeq2Seq,
                DataCollatorForSeq2Seq,
                DataCollatorWithFlattening,
                RewardDataCollatorWithPadding,
+                SequenceParallelDataCollator,
            ]
        ]
        collator_args = [self.tokenizer]
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -412,8 +412,21 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
            if self.args.curriculum_sampling:
                sampler = SequentialSampler(self.train_dataset)
            else:
+                generator = None
+                if self.args.sequence_parallel_size > 1:
+                    generator = torch.Generator()
+                    generator.manual_seed(self.args.getattr("seed", 0))
+
                sampler = RandomSampler(self.train_dataset)

+            # if dist.get_rank() == 0:
+            #     import ipdb; ipdb.set_trace()
+            # dist.barrier()
+
+            # if dist.get_rank() == 1:
+            #     import ipdb; ipdb.set_trace()
+            # dist.barrier()
+
            return MultipackBatchSampler(
                sampler,
                lengths=get_dataset_lengths(self.train_dataset),
@@ -426,7 +439,14 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
            )
        if self.args.curriculum_sampling:
            return SequentialSampler(self.train_dataset)
-        return super()._get_train_sampler()
+
+        sampler = super()._get_train_sampler()
+        if self.args.sequence_parallel_size > 1:
+            generator = torch.Generator()
+            generator.manual_seed(self.args.getattr("seed", 0))
+            sampler.generator = generator
+
+        return sampler

    def _get_eval_sampler(
        self, eval_dataset: Dataset
@@ -478,6 +498,12 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
                dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = seed_worker

+            if dist.get_rank() == 0:
+                import ipdb
+
+                ipdb.set_trace()
+            dist.barrier()
+
            self.accelerator.even_batches = False
            return self.accelerator.prepare_data_loader(
                DataLoader(train_dataset, **dataloader_params)
@@ -805,60 +831,36 @@ class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
        self,
        model: nn.Module,
        inputs: dict[str, torch.Tensor | Any],
-        num_items_in_batch=None,
+        num_items_in_batch: int = None,
    ) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.
        """
        if self.args.sequence_parallel_size > 1:
-            # At this point, inputs should already be partitioned by the sequence parallel data collator
-            # We'll just log some information about the partitioned data
+            # At this point, inputs should already be partitioned by the sequence
+            # parallel data collator
            batch_size = inputs["input_ids"].shape[0]
            seq_len = inputs["input_ids"].shape[1]

            # Get rank and SP information
            sp_group = get_ring_attn_group()
-            rank = dist.get_rank()
-            sp_rank = dist.get_rank(group=sp_group) if sp_group else rank
            world_size = (
                dist.get_world_size(group=sp_group)
                if sp_group
                else dist.get_world_size()
            )

-            # Sample tokens from our slice to verify partitioning
-            sample_start = (
-                inputs["input_ids"][0, :5].tolist()
-                if seq_len >= 5
-                else inputs["input_ids"][0, :].tolist()
-            )
-            sample_end = (
-                inputs["input_ids"][0, -5:].tolist()
-                if seq_len >= 5
-                else inputs["input_ids"][0, :].tolist()
-            )
-
-            LOG.info(
-                f"GPU {rank} (SP rank {sp_rank}) | Step {self.state.global_step} | "
-                f"Slice shape: batch_size={batch_size}, seq_len={seq_len} | "
-                f"Sample start: {sample_start}, end: {sample_end}"
-            )
-
            # Calculate the full sequence length across all GPUs in this SP group
-            full_seq_len = seq_len * world_size
+            total_seq_len = seq_len * world_size

            # Pass the partitioned sequence information to ring flash attention
-            self._update_ring_flash_attn_params([seq_len] * batch_size, full_seq_len)
+            self._update_ring_flash_attn_params(
+                packed_seq_lens=[seq_len] * batch_size, total_seq_len=total_seq_len
+            )

        # Get the loss from the parent implementation
        loss = super().training_step(model, inputs, num_items_in_batch)

-        if self.args.sequence_parallel_size > 1:
-            rank = dist.get_rank()
-            LOG.info(
-                f"GPU {rank} | Step {self.state.global_step} | Loss: {loss.item()}"
-            )
-
        return loss

    def _update_ring_flash_attn_params(self, packed_seq_lens, total_seq_len):
--- a/src/axolotl/utils/collators/sequence_parallel.py
+++ b/src/axolotl/utils/collators/sequence_parallel.py
@@ -1,12 +1,11 @@
 """Module for sequence parallelism data collators."""

+import logging
 from dataclasses import dataclass

 import torch
 import torch.distributed as dist
-from accelerate.logging import get_logger

-from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group
 from axolotl.utils.collators.batching import (
    BatchSamplerDataCollatorForSeq2Seq,
@@ -14,32 +13,12 @@ from axolotl.utils.collators.batching import (
    V2BatchSamplerDataCollatorForSeq2Seq,
 )

-configure_logging()
-LOG = get_logger(__name__)
+logger = logging.getLogger(__name__)


-def find_sample_boundaries(position_ids):
-    """
-    Find the boundaries between packed samples in a sequence by looking for
-    where position_ids decrease.
-
-    Returns:
-        List of boundary indices for each sequence in the batch
-    """
-    batch_boundaries = []
-
-    for i in range(position_ids.shape[0]):
-        seq = position_ids[i]
-        boundaries = []
-        for j in range(1, len(seq)):
-            if seq[j] < seq[j - 1]:
-                boundaries.append(j)
-        batch_boundaries.append(boundaries)
-
-    return batch_boundaries
-
-
-def adjust_position_ids_for_slice(position_ids, start_idx):
+def adjust_position_ids_for_slice(
+    position_ids: list | torch.Tensor, start_idx: int
+) -> torch.Tensor:
    """
    Adjust position IDs for a sliced sequence to maintain proper relative positions.
    This handles the case where position IDs might not be contiguous due to sample packing.
@@ -64,370 +43,135 @@ def adjust_position_ids_for_slice(position_ids, start_idx):
            if seq[j] < seq[j - 1]:
                boundaries.append(j)

-        # Debug: log the found boundaries
-        LOG.debug(f"Sequence {i}: Found sample boundaries at positions {boundaries}")
-
        # No need to adjust if there are no boundaries or this is a single sample
        if not boundaries:
-            old_values = seq[0:5].tolist()  # Sample of original values
            adjusted_pos_ids[i] = seq - start_idx
-            new_values = adjusted_pos_ids[i, 0:5].tolist()  # Sample of new values
-            LOG.debug(
-                f"Sequence {i}: No boundaries, subtracting {start_idx} uniformly. Example values before: {old_values}, after: {new_values}"
-            )
            continue

        # Adjust each segment separately
        prev_boundary = 0
-        for boundary_idx, boundary in enumerate(boundaries):
-            segment = seq[prev_boundary:boundary]
-            old_values = segment[
-                0 : min(5, len(segment))
-            ].tolist()  # Sample of original values
+        for boundary in boundaries:
            adjusted_pos_ids[i, prev_boundary:boundary] -= start_idx
-            new_values = adjusted_pos_ids[
-                i, prev_boundary : min(prev_boundary + 5, boundary)
-            ].tolist()  # Sample of new values
-            LOG.debug(
-                f"Sequence {i}, Segment {boundary_idx}: Adjusting positions {prev_boundary}-{boundary-1}. Example values before: {old_values}, after: {new_values}"
-            )
            prev_boundary = boundary

        # Last segment
-        segment = seq[prev_boundary:]
-        old_values = segment[
-            0 : min(5, len(segment))
-        ].tolist()  # Sample of original values
        adjusted_pos_ids[i, prev_boundary:] -= start_idx
-        new_values = adjusted_pos_ids[
-            i, prev_boundary : min(prev_boundary + 5, len(seq))
-        ].tolist()  # Sample of new values
-        LOG.debug(
-            f"Sequence {i}, Last segment: Adjusting positions {prev_boundary}-end. Example values before: {old_values}, after: {new_values}"
-        )

    return adjusted_pos_ids


-def check_for_boundary_splits(boundaries, slice_start, slice_end):
+class SequenceParallelMixin:
    """
-    Check if any sample boundaries fall near the edge of a sequence slice.
-    These edge cases could cause issues with gradient computation.
-
-    Args:
-        boundaries: List of indices where sample boundaries occur
-        slice_start: Start index of this GPU's slice
-        slice_end: End index of this GPU's slice
-
-    Returns:
-        List of potentially problematic boundaries
-    """
-    # Consider a boundary "near" an edge if it's within 5 tokens
-    buffer_size = 5
-    problem_boundaries = []
-
-    for boundary in boundaries:
-        # Check if boundary is near the start of the slice
-        if slice_start <= boundary < slice_start + buffer_size:
-            problem_boundaries.append((boundary, "start", boundary - slice_start))
-        # Check if boundary is near the end of the slice
-        elif slice_end - buffer_size <= boundary < slice_end:
-            problem_boundaries.append((boundary, "end", slice_end - boundary))
-
-    return problem_boundaries
-
-
-@dataclass
-class SequenceParallelPackedDataCollator(BatchSamplerDataCollatorForSeq2Seq):
-    """
-    Data collator for sequence parallelism with sample packing.
-    Combines multiple samples into a packed sequence, then slices it for each GPU.
+    Mixin to add sequence parallelism slicing to data collators.
    """

-    debug_level: str = "debug"  # Can be "debug" for more verbose output
-
-    def __call__(self, features, return_tensors=None):
-        # First, use the parent collator to handle sample packing and padding
-        batch = super().__call__(features, return_tensors=return_tensors)
-
-        sp_group = get_ring_attn_group()
-        if sp_group is None:
-            return batch  # Not using sequence parallelism
-
+    def __post_init__(self):
        # Get information about our position in the SP group
-        rank = dist.get_rank(group=sp_group)
-        world_size = dist.get_world_size(group=sp_group)
+        sp_group = get_ring_attn_group()
+        self.rank = dist.get_rank(group=sp_group)
+        self.world_size = dist.get_world_size(group=sp_group)

-        # Enable debug level if requested
-        if self.debug_level == "debug":
-            original_shapes = {
-                k: v.shape if hasattr(v, "shape") else None for k, v in batch.items()
-            }
-            LOG.info(f"GPU {rank}: Original batch shapes: {original_shapes}")
+    def apply_sequence_parallelism(
+        self, batch: dict[str, torch.Tensor]
+    ) -> torch.Tensor:
+        """
+        Apply sequence parallelism slicing to a batch.

-            if "position_ids" in batch:
-                # Find and log sample boundaries before slicing
-                boundaries = find_sample_boundaries(batch["position_ids"])
-                for i, seq_boundaries in enumerate(boundaries):
-                    LOG.info(
-                        f"GPU {rank}: Sequence {i} has {len(seq_boundaries)} packed samples with boundaries at {seq_boundaries}"
-                    )
+        Args:
+            batch: Batch dictionary from parent collator.

+        Returns:
+            Sliced batch dictionary.
+        """
        # Process keys that need to be sliced
        for key in ["input_ids", "attention_mask", "labels"]:
            if key in batch:
                seq_len = batch[key].shape[1]
-                slice_size = seq_len // world_size
-                start_idx = rank * slice_size
-                end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
-
-                LOG.info(
-                    f"GPU {rank}: Slicing {key} from {start_idx} to {end_idx} (total len: {seq_len})"
+                slice_size = seq_len // self.world_size
+                start_idx = self.rank * slice_size
+                end_idx = (
+                    start_idx + slice_size
+                    if self.rank < self.world_size - 1
+                    else seq_len
                )

-                if self.debug_level == "debug" and key == "input_ids":
-                    # Log portions of the input to verify correct slicing
-                    for i in range(
-                        min(2, batch[key].shape[0])
-                    ):  # Look at up to 2 sequences
-                        # Sample the beginning, middle and end of the sequence before slicing
-                        start_sample = batch[key][i, 0:5].tolist()
-                        mid_sample = batch[key][
-                            i, seq_len // 2 : seq_len // 2 + 5
-                        ].tolist()
-                        end_sample = batch[key][i, -5:].tolist()
-                        LOG.info(
-                            f"GPU {rank}, Seq {i} before slicing: start={start_sample}, mid={mid_sample}, end={end_sample}"
-                        )
-
-                batch[key] = batch[key][:, start_idx:end_idx]
-
-                if self.debug_level == "debug" and key == "input_ids":
-                    # Log after slicing to verify
-                    for i in range(min(2, batch[key].shape[0])):
-                        sliced_sample = batch[key][i, 0:5].tolist()
-                        sliced_end = batch[key][i, -5:].tolist()
-                        LOG.info(
-                            f"GPU {rank}, Seq {i} after slicing: start={sliced_sample}, end={sliced_end}"
-                        )
-
-        # Handle position_ids specially if present (important for packed sequences)
-        if "position_ids" in batch:
-            # For position_ids, we need to adjust them after slicing
-            # Each position_id should be relative to its slice
-            pos_ids = batch["position_ids"]
-            seq_len = pos_ids.shape[1]
-            slice_size = seq_len // world_size
-            start_idx = rank * slice_size
-            end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
-
-            # Find boundaries before slicing
-            if self.debug_level == "debug":
-                full_boundaries = find_sample_boundaries(pos_ids)
-
-                # Check for boundaries that fall near slice edges
-                for i, boundaries in enumerate(full_boundaries):
-                    problem_boundaries = check_for_boundary_splits(
-                        boundaries, start_idx, end_idx
+                if key == "input_ids":
+                    # Before slicing
+                    non_pad_tokens_total = (batch["input_ids"] != 128001).sum().item()
+                    logger.info(
+                        f"GPU {self.rank}: Total sequence length: {seq_len}, "
+                        f"Non-padding tokens: {non_pad_tokens_total}"
                    )
-                    if problem_boundaries:
-                        LOG.warning(
-                            f"GPU {rank}: Sequence {i} has sample boundaries near slice edges: {problem_boundaries}"
-                        )
+                    logger.info(f"GPU {self.rank} token IDs: {batch['input_ids']}")

-            batch["position_ids"] = pos_ids[:, start_idx:end_idx]
-
-            # Find boundaries after slicing to verify correct transfer
-            if self.debug_level == "debug":
-                sliced_boundaries = find_sample_boundaries(batch["position_ids"])
-                for i, boundaries in enumerate(sliced_boundaries):
-                    LOG.info(
-                        f"GPU {rank}: After slicing, sequence {i} has boundaries at {boundaries}"
+                    # After slicing
+                    non_pad_tokens_slice = (batch["input_ids"] != 128001).sum().item()
+                    logger.info(
+                        f"GPU {self.rank}: Slice {start_idx}-{end_idx}, "
+                        f"Non-padding tokens in slice: {non_pad_tokens_slice}"
                    )

-            # Adjust position_ids to be relative to the start of this slice
-            # Only subtract if not the first GPU in the group
-            if rank > 0:
-                # Find boundaries between samples in the position_ids
-                # This preserves the sample packing structure
-                old_pos_ids = batch["position_ids"].clone()
-                batch["position_ids"] = adjust_position_ids_for_slice(
-                    batch["position_ids"], start_idx
-                )
+                dist.barrier()

-                if self.debug_level == "debug":
-                    # Compare before and after adjustment
-                    for i in range(min(2, old_pos_ids.shape[0])):
-                        before = old_pos_ids[i, 0:10].tolist()
-                        after = batch["position_ids"][i, 0:10].tolist()
-                        LOG.info(
-                            f"GPU {rank}, Seq {i} position_ids adjustment: before={before}, after={after}"
-                        )
-
-        # Add gradient norm tracking for debugging
-        if self.debug_level == "debug":
-            # Attach hook to track gradient norms during backward pass
-            def hook_fn(grad):
-                norm = grad.norm().item()
-                LOG.info(f"GPU {rank}: Gradient norm = {norm:.4f}")
-                # Record any abnormally high gradients
-                if norm > 10.0:
-                    LOG.warning(f"GPU {rank}: High gradient norm detected: {norm:.4f}")
-                return grad
-
-            # Apply hook to input_ids embeddings if it goes through backward pass
-            if "input_ids" in batch and batch["input_ids"].requires_grad:
-                batch["input_ids"].register_hook(hook_fn)
-
-        return batch
-
-
-@dataclass
-class V2SequenceParallelPackedDataCollator(V2BatchSamplerDataCollatorForSeq2Seq):
-    """
-    Data collator for sequence parallelism with V2 sample packing.
-    """
-
-    debug_level: str = "debug"  # Can be "debug" for more verbose output
-
-    def __call__(self, features, return_tensors=None):
-        # Implementation similar to SequenceParallelPackedDataCollator with V2 base
-        # First, use the parent collator to handle sample packing and padding
-        batch = super().__call__(features, return_tensors=return_tensors)
-
-        sp_group = get_ring_attn_group()
-        if sp_group is None:
-            return batch  # Not using sequence parallelism
-
-        # Get information about our position in the SP group
-        rank = dist.get_rank(group=sp_group)
-        world_size = dist.get_world_size(group=sp_group)
-
-        # Enable debug level if requested
-        if self.debug_level == "debug":
-            original_shapes = {
-                k: v.shape if hasattr(v, "shape") else None for k, v in batch.items()
-            }
-            LOG.info(f"GPU {rank}: Original batch shapes: {original_shapes}")
-
-            if "position_ids" in batch:
-                # Find and log sample boundaries before slicing
-                boundaries = find_sample_boundaries(batch["position_ids"])
-                for i, seq_boundaries in enumerate(boundaries):
-                    LOG.info(
-                        f"GPU {rank}: Sequence {i} has {len(seq_boundaries)} packed samples with boundaries at {seq_boundaries}"
-                    )
-
-        # Process keys that need to be sliced
-        for key in ["input_ids", "attention_mask", "labels"]:
-            if key in batch:
-                seq_len = batch[key].shape[1]
-                slice_size = seq_len // world_size
-                start_idx = rank * slice_size
-                end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
-
-                if self.debug_level == "debug" and key == "input_ids":
-                    # Log portions of the input to verify correct slicing
-                    for i in range(
-                        min(2, batch[key].shape[0])
-                    ):  # Look at up to 2 sequences
-                        # Sample the beginning, middle and end of the sequence before slicing
-                        start_sample = batch[key][i, 0:5].tolist()
-                        mid_sample = batch[key][
-                            i, seq_len // 2 : seq_len // 2 + 5
-                        ].tolist()
-                        end_sample = batch[key][i, -5:].tolist()
-                        LOG.info(
-                            f"GPU {rank}, Seq {i} before slicing: start={start_sample}, mid={mid_sample}, end={end_sample}"
-                        )
-
-                batch[key] = batch[key][:, start_idx:end_idx]
-
-        # Handle position_ids specially (same as in SequenceParallelPackedDataCollator)
-        if "position_ids" in batch:
-            # Implementation identical to the one in SequenceParallelPackedDataCollator
-            pos_ids = batch["position_ids"]
-            seq_len = pos_ids.shape[1]
-            slice_size = seq_len // world_size
-            start_idx = rank * slice_size
-            end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
-
-            # Find boundaries before slicing
-            if self.debug_level == "debug":
-                full_boundaries = find_sample_boundaries(pos_ids)
-
-                # Check for boundaries that fall near slice edges
-                for i, boundaries in enumerate(full_boundaries):
-                    problem_boundaries = check_for_boundary_splits(
-                        boundaries, start_idx, end_idx
-                    )
-                    if problem_boundaries:
-                        LOG.warning(
-                            f"GPU {rank}: Sequence {i} has sample boundaries near slice edges: {problem_boundaries}"
-                        )
-
-            batch["position_ids"] = pos_ids[:, start_idx:end_idx]
-
-            # Adjust position_ids to be relative to the start of this slice
-            if rank > 0:
-                batch["position_ids"] = adjust_position_ids_for_slice(
-                    batch["position_ids"], start_idx
-                )
-
-        return batch
-
-
-@dataclass
-class SequenceParallelDataCollator(DataCollatorForSeq2Seq):
-    """
-    Data collator for sequence parallelism without sample packing.
-    """
-
-    debug_level: str = "debug"  # Can be "debug" for more verbose output
-
-    def __call__(self, features, return_tensors=None):
-        # First, use the parent collator to pad everything correctly
-        batch = super().__call__(features, return_tensors=return_tensors)
-
-        sp_group = get_ring_attn_group()
-        if sp_group is None:
-            return batch  # Not using sequence parallelism
-
-        # Get information about our position in the SP group
-        rank = dist.get_rank(group=sp_group)
-        world_size = dist.get_world_size(group=sp_group)
-
-        if self.debug_level == "debug":
-            original_shapes = {
-                k: v.shape if hasattr(v, "shape") else None for k, v in batch.items()
-            }
-            LOG.info(f"GPU {rank}: Original batch shapes: {original_shapes}")
-
-        # Process keys that need to be sliced
-        for key in ["input_ids", "attention_mask", "labels"]:
-            if key in batch:
-                seq_len = batch[key].shape[1]
-                slice_size = seq_len // world_size
-                start_idx = rank * slice_size
-                end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
-
-                LOG.info(
-                    f"GPU {rank}: Slicing {key} from {start_idx} to {end_idx} (total len: {seq_len})"
-                )
                batch[key] = batch[key][:, start_idx:end_idx]

        # Handle position_ids if present
        if "position_ids" in batch:
            pos_ids = batch["position_ids"]
            seq_len = pos_ids.shape[1]
-            slice_size = seq_len // world_size
-            start_idx = rank * slice_size
-            end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
+            slice_size = seq_len // self.world_size
+            start_idx = self.rank * slice_size
+            end_idx = (
+                start_idx + slice_size if self.rank < self.world_size - 1 else seq_len
+            )

            batch["position_ids"] = pos_ids[:, start_idx:end_idx]

-            # For non-packed sequences, we can simply subtract start_idx from all position_ids
-            if rank > 0:
-                batch["position_ids"] -= start_idx
+            # Adjust position_ids to be relative to the slice start
+            if self.rank > 0:
+                batch["position_ids"] = adjust_position_ids_for_slice(
+                    batch["position_ids"], start_idx
+                )

        return batch
+
+
+@dataclass
+class SequenceParallelPackedDataCollator(
+    SequenceParallelMixin, BatchSamplerDataCollatorForSeq2Seq
+):
+    """
+    Data collator for sequence parallelism with sample packing. Combines multiple
+    samples into a packed sequence, then slices it for each GPU.
+    """
+
+    def __call__(self, features, return_tensors=None):
+        # Use the parent collator to handle sample packing and padding
+        batch = super().__call__(features, return_tensors=return_tensors)
+        return self.apply_sequence_parallelism(batch)
+
+
+@dataclass
+class V2SequenceParallelPackedDataCollator(
+    SequenceParallelMixin, V2BatchSamplerDataCollatorForSeq2Seq
+):
+    """
+    Data collator for sequence parallelism with V2 sample packing.
+    """
+
+    def __call__(self, features, return_tensors=None):
+        # Use the parent collator to handle sample packing and padding
+        batch = super().__call__(features, return_tensors=return_tensors)
+        return self.apply_sequence_parallelism(batch)
+
+
+@dataclass
+class SequenceParallelDataCollator(SequenceParallelMixin, DataCollatorForSeq2Seq):
+    """
+    Data collator for sequence parallelism without sample packing.
+    """
+
+    def __call__(self, features, return_tensors=None):
+        # Use the parent collator to pad everything correctly
+        batch = super().__call__(features, return_tensors=return_tensors)
+        return self.apply_sequence_parallelism(batch)
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -67,7 +67,12 @@ from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrap
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers
 from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant

-LOG = logging.getLogger("axolotl")
+LOG = logging.getLogger(__name__)
+
+MULTIMODEL_AUTO_MODEL_MAPPING = {
+    "llava": LlavaForConditionalGeneration,
+    "mllama": MllamaForConditionalGeneration,
+}


 # copied from accelerator.FullyShardedDataParallelPlugin
@@ -476,7 +481,7 @@ class ModelLoader:
        else:
            self.text_model_config = self.model_config

-        self.AutoModelLoader = AutoModelForCausalLM  # pylint: disable=invalid-name
+        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name

    def apply_patches(self) -> None:
        # load any patches from plugins
@@ -612,7 +617,7 @@ class ModelLoader:

            patch_self_attn_lora()

-    def patch_llama_derived_model(self) -> None:
+    def patch_llama_derived_model(self):
        """Modify all llama derived models in one block"""
        self.patch_loss_llama()

@@ -662,25 +667,16 @@ class ModelLoader:
                "Shifted-sparse attention not currently implemented without flash attention."
            )

-    def set_auto_model_loader(self) -> None:
-        """set self.AutoModelLoader
-        - default value: AutoModelForCausalLM (set at __init__)
-        - when using a multi modality model, self.AutoModelLoader should
-          be set according to model type of the model
+    def set_auto_model_loader(self):
+        """
+        Set self.auto_model_loader. Defaults to `transformers.AutoModelForCausalLM`
+        (set at `__init__`). When using a multimodal model, `self.auto_model_loader`
+        should be set according to the type of the model.
        """
        if self.cfg.is_multimodal:
-            if self.model_config.model_type == "llava":
-                self.AutoModelLoader = (  # pylint: disable=invalid-name
-                    LlavaForConditionalGeneration
-                )
-            elif self.model_config.model_type == "mllama":
-                self.AutoModelLoader = (  # pylint: disable=invalid-name
-                    MllamaForConditionalGeneration
-                )
-            else:
-                self.AutoModelLoader = (
-                    AutoModelForVision2Seq  # pylint: disable=invalid-name
-                )
+            self.auto_model_loader = MULTIMODEL_AUTO_MODEL_MAPPING.get(
+                self.model_config.model_type, AutoModelForVision2Seq
+            )

    def set_device_map_config(self) -> None:
        device_map = self.cfg.device_map
@@ -704,7 +700,7 @@ class ModelLoader:
            from accelerate import infer_auto_device_map

            with init_empty_weights():
-                model_canvas = self.AutoModelLoader.from_config(
+                model_canvas = self.auto_model_loader.from_config(
                    self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                )
@@ -925,11 +921,27 @@ class ModelLoader:

            if self.cfg.is_multimodal:
                self.model_config.text_config = self.text_model_config
-            self.model = self.AutoModelLoader.from_pretrained(
-                self.base_model,
-                config=self.model_config,
-                **self.model_kwargs,
-            )
+
+            # Load model with random initialization if specified
+            if self.cfg.random_init:
+                # AutoModel classes support the from_config method
+                if self.auto_model_loader in [
+                    AutoModelForCausalLM,
+                    AutoModelForVision2Seq,
+                ]:
+                    self.model = self.auto_model_loader.from_config(
+                        config=self.model_config,
+                    )
+                else:
+                    self.model = self.auto_model_loader(
+                        config=self.model_config,
+                    )
+            else:
+                self.model = self.auto_model_loader.from_pretrained(
+                    self.base_model,
+                    config=self.model_config,
+                    **self.model_kwargs,
+                )

            #  TODO (MengqingCao) split these patches seperately
            if self.cfg.flash_attention and not self.inference:
@@ -967,7 +979,7 @@ class ModelLoader:
            if self.cfg.is_multimodal:
                self.model_config.text_config = self.text_model_config
            if self.cfg.gptq:
-                self.model = self.AutoModelLoader.from_pretrained(
+                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
@@ -1000,7 +1012,7 @@ class ModelLoader:
            if self.cfg.gptq:
                if self.cfg.is_multimodal:
                    self.model_config.text_config = self.text_model_config
-                self.model = self.AutoModelLoader.from_pretrained(
+                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
@@ -1020,7 +1032,7 @@ class ModelLoader:

                if self.cfg.is_multimodal:
                    self.model_config.text_config = self.text_model_config
-                self.model = self.AutoModelLoader.from_pretrained(
+                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
@@ -1316,7 +1328,7 @@ def load_model(
    """
    Load a model for a given configuration and tokenizer.
    """
-    loader = ModelLoader(
+    model_loader = ModelLoader(
        cfg,
        tokenizer,
        processor=processor,
@@ -1324,7 +1336,7 @@ def load_model(
        reference_model=reference_model,
        **kwargs,
    )
-    return loader.load_model()
+    return model_loader.load_model()


 def load_adapter(model, cfg, adapter, inference=False):
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -443,6 +443,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                    - 1
                )
                * cfg.num_epochs
+                * cfg.sequence_parallel_size
            )
            LOG.debug(
                f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}",
@@ -473,7 +474,11 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            LOG.debug(f"data_loader_len: {data_loader_len}", main_process_only=True)
            # FIXME: is there a bug here somewhere? the total num steps depends
            # on the agreed on value for sample_packing_eff_est
-            total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))
+            total_num_steps = int(
+                math.floor(
+                    data_loader_len * cfg.num_epochs * cfg.sequence_parallel_size
+                )
+            )

            def calc_sample_packing_eff_est(estimates: List[float]):
                LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
@@ -494,7 +499,12 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
            )
    else:
        total_num_steps = int(
-            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
+            math.ceil(
+                len(train_dataset)
+                * cfg.num_epochs
+                * cfg.sequence_parallel_size
+                / cfg.batch_size
+            )
        )
    LOG.debug(f"total_num_steps: {total_num_steps}", main_process_only=True)
    return total_num_steps
--- a/tests/e2e/patched/test_sequence_parallelism.py
+++ b/tests/e2e/patched/test_sequence_parallelism.py
@@ -14,7 +14,6 @@ with patch.dict("sys.modules", {"ring_flash_attn": ring_flash_attn_mock}):
    from axolotl.utils.collators.sequence_parallel import (
        adjust_position_ids_for_slice,
        check_for_boundary_splits,
-        find_sample_boundaries,
    )


@@ -30,24 +29,6 @@ def partial_state():
 class TestSequenceParallelHelpers:
    """Test helper functions used in sequence parallelism."""

-    def test_find_sample_boundaries(self):
-        """Test detection of boundaries in position_ids."""
-        # Create sample position_ids with multiple sequences
-        position_ids = torch.tensor(
-            [
-                # First sequence with 2 samples (boundary at index 5)
-                [0, 1, 2, 3, 4, 0, 1, 2, 3],
-                # Second sequence with 3 samples (boundaries at 3 and 7)
-                [0, 1, 2, 0, 1, 2, 3, 0, 1],
-            ]
-        )
-
-        boundaries = find_sample_boundaries(position_ids)
-
-        assert len(boundaries) == 2
-        assert boundaries[0] == [5]  # First sequence has boundary at index 5
-        assert boundaries[1] == [3, 7]  # Second sequence has boundaries at 3 and 7
-
    def test_adjust_position_ids_for_slice(self, partial_state):
        """Test position_ids adjustment for sequence slices."""
        # Create sample position_ids with multiple sequences