better handling of multipack dataset length (#2296)

2025-02-01 21:10:34 -05:00
parent a20f17689b
commit 80e1468b8d
1 changed files with 11 additions and 32 deletions
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -4,7 +4,6 @@ Multipack Batch Sampler
 """
 import logging
 import math
-import os
 from typing import Any, Iterable, List, Union

 import numba
@@ -117,6 +116,7 @@ class MultipackBatchSampler(BatchSampler):
        lengths: np.ndarray,
        packing_efficiency_estimate: float = 1.0,
        drop_last: bool = False,
+        num_count_samples: int = 16,
        **kwargs,
    ):
        super().__init__(sampler, batch_size, drop_last)
@@ -133,6 +133,9 @@ class MultipackBatchSampler(BatchSampler):
        self.eff_total_used = 0
        self.eff_total_slots = 0

+        # Number of times to calculate the batches on the local rank; the minimum count seen is used as its packed dataset length
+        self.num_count_samples = num_count_samples
+        # the minimum packed dataset length across all ranks determined by a gather/broadcast
        self.len_across_ranks = None

    def set_epoch(self, epoch: int):
@@ -169,6 +172,9 @@ class MultipackBatchSampler(BatchSampler):

    def __iter__(self):
        batches = self.generate_batches(set_stats=True)
+        if self.len_across_ranks:
+            # make sure the batches we iterate over are truncated to the same min length across all ranks
+            batches = batches[: self.len_across_ranks]
        return iter(batches)

    def num_batches(self):
@@ -195,42 +201,15 @@ class MultipackBatchSampler(BatchSampler):
    def gather_len_batches(self, num):
        def calc_min_len(estimates: list[(int, float)]):
            LOG.info(f"gather_len_batches: {repr(estimates)}")
-            return math.floor(0.998 * min(estimates))
+            return math.floor(min(estimates))

        min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
        return min_len_batches

    def __len__(self):
        if not self.len_across_ranks:
-            len_batches = self.num_batches()
+            len_batches = min(
+                [self.num_batches() for _ in range(self.num_count_samples)]
+            )
            self.len_across_ranks = self.gather_len_batches(len_batches)
        return self.len_across_ranks
-
-    def _len_est(self):
-        efficiency = (
-            self.packing_efficiency_estimate
-            if self.packing_efficiency_estimate
-            else self.gather_efficiency()
-        )
-        world_size = int(os.getenv("WORLD_SIZE", "1"))
-        lengths_sum = np.sum(self.lengths)
-        lengths_sum_per_device = lengths_sum // world_size
-        LOG.info(
-            f"packing_efficiency_estimate: {efficiency} "
-            f"total_num_tokens per device: {lengths_sum_per_device}"
-        )
-
-        # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
-        return max(
-            0,
-            (
-                world_size
-                * math.floor(
-                    0.99
-                    * lengths_sum_per_device
-                    / efficiency
-                    // (self.batch_max_len * self.batch_size)
-                )
-                - 1
-            ),
-        )