remove columns after tokenizing for pretraining (#571)

2023-09-14 11:08:22 -04:00
parent 3b18c963cc
commit 115795079d
1 changed file with 3 additions and 3 deletions
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -644,8 +644,8 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
        encode,
        batched=True,
        input_columns="text",
-        remove_columns=[
-            "text",
-        ],
+        # Remove all pre-existing columns after mapping; otherwise their length
+        # no longer matches that of the encoded/tokenized columns.
+        remove_columns=dataset.features.keys(),
    )
    return dataset