support for true batches with multipack (#1230)

* support for true batches with multipack
* patch the map dataset fetcher to handle batches with packed indexes
* patch 4d mask creation for sdp attention
* better handling for BetterTransformer
* patch general case for 4d mask
* setup forward patch. WIP
* fix patch file
* support for multipack w/o flash attention for llama
* cleanup
* add warning about bf16 vs fp16 for multipack with sdpa
* bugfixes
* add 4d multipack tests, refactor patches
* update tests and add warnings
* fix e2e file check
* skip sdpa test if not at least torch 2.1.1, update docs
2024-02-01 10:18:42 -05:00
parent c67fb71583
commit 00568c1539
24 changed files with 573 additions and 246 deletions
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -117,7 +117,7 @@ class MultipackBatchSampler(BatchSampler):
        packing_efficiency_estimate: float = 1.0,
    ):
        super().__init__(sampler, batch_size, drop_last)
-        self.batch_size = None
+        self.batch_size = batch_size
        self.batch_max_len = batch_max_len
        self.lengths: np.ndarray = lengths
        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
@@ -147,7 +147,13 @@ class MultipackBatchSampler(BatchSampler):
            n=1,
        )

-        batches = [[indices[b_idx] for b_idx in batch] for batch in batches]
+        batches = [
+            [
+                [indices[b_idx] for b_idx in batch]
+                for batch in batches[i : i + self.batch_size]
+            ]
+            for i in range(0, len(batches), self.batch_size)
+        ]

        # statistics
        if set_stats:
@@ -189,7 +195,7 @@ class MultipackBatchSampler(BatchSampler):
                    0.99
                    * lengths_sum_per_device
                    / self.packing_efficiency_estimate
-                    // self.batch_max_len
+                    // (self.batch_max_len * self.batch_size)
                )
                - 1
            ),