From 115795079d46835a8e4390a427979a9b179e9ca0 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Thu, 14 Sep 2023 11:08:22 -0400
Subject: [PATCH] remove columns after tokenizing for pretraining (#571)

---
 src/axolotl/utils/data.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index f024d19c4..7ad8b34ee 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -644,8 +644,8 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
         encode,
         batched=True,
         input_columns="text",
-        remove_columns=[
-            "text",
-        ],
+        # remove all the existing columns after mapping since they end up having
+        # a different length than the encoded/tokenized column
+        remove_columns=dataset.features.keys(),
     )
     return dataset