From 97a2fa27819c1e3de74f3c14d51b5b47d5b23aa6 Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Mon, 17 Feb 2025 14:07:27 +0900 Subject: [PATCH] Select input_ids explicitly after pandas conversion (#2335) Without selecting the column, applying `len` counts each whole row as 1, which results in the total number of samples instead of the token count. --- src/axolotl/utils/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c8e365fc5..8553339b9 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -396,8 +396,8 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): ): total_num_tokens = np.sum( train_dataset.select_columns("input_ids") - .to_pandas() - .apply(lambda x: len(x)) # pylint: disable=unnecessary-lambda + .to_pandas()["input_ids"] + .apply(len) .values ) LOG.debug(f"total_num_tokens: {total_num_tokens:_}", main_process_only=True)