Fix pretraining with iterable/streaming Dataset (#556)

* return without packing prep/len

* fix remove columns

* fix encode arguments

* add error when max steps not set

* fix test

---------

Co-authored-by: Jan Philipp Harries <jphme@users.noreply.github.com>
This commit is contained in:
Jan Philipp Harries
2023-09-13 06:16:40 +02:00
committed by GitHub
parent 9845c5e12d
commit 2f586d18db
3 changed files with 19 additions and 6 deletions

View File

@@ -35,7 +35,7 @@ class TestEncodePretraining(unittest.TestCase):
"hello, hello",
]
}
-result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
+result = encode_pretraining(self.tokenizer, self.max_tokens, examples["text"])
self.assertEqual(len(result["input_ids"]), 3)