remove columns after tokenizing for pretraining (#571)

This commit is contained in:
Wing Lian
2023-09-14 11:08:22 -04:00
committed by GitHub
parent 3b18c963cc
commit 115795079d

View File

@@ -644,8 +644,8 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
encode,
batched=True,
input_columns="text",
remove_columns=[
"text",
],
# Remove all of the existing columns after mapping, since they end up with
# a different length than the encoded/tokenized column.
remove_columns=dataset.features.keys(),
)
return dataset