From ac4b700daa27b249dc8cee43f3d52e07e84f4121 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Thu, 10 Aug 2023 19:01:17 -0400
Subject: [PATCH] optimization if total_num_tokens is already known

---
 src/axolotl/utils/dataloader.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/axolotl/utils/dataloader.py b/src/axolotl/utils/dataloader.py
index ca74de271..3b281eb2c 100644
--- a/src/axolotl/utils/dataloader.py
+++ b/src/axolotl/utils/dataloader.py
@@ -261,9 +261,7 @@ class MultipackDistributedDataloader:
         batch_gen_thread.join()
 
     def _len_est(self):
-        indices = range(0, len(self.dataset))
-        lengths = self.lengths[indices]
-        lengths_sum = np.cumsum(lengths)[-1]
+        lengths_sum = np.sum(self.lengths)
         lengths_sum_per_device = lengths_sum // self.device_count
         LOG.info(
             f"packing_efficiency_estimate: {self.packing_efficiency_estimate} "