fixes to accelerator so that iterable pretraining datasets work (#1759)

* fixes to accelerator so that iterable pretraining datasets work

* fix the pretraining test params

* split_batches, not dispatch_batches, needs to be set

* update c4 datasets

* set epochs in pretrain config test

* need to set both split_batches and dispatch_batches to false for pretraining

* fix bool val in comment
This commit is contained in:
Wing Lian
2024-07-17 10:58:38 -04:00
committed by GitHub
parent 152ab76623
commit 976f85195a
4 changed files with 98 additions and 2 deletions

View File

@@ -24,7 +24,7 @@ class TestPretrainingPacking(unittest.TestCase):
def test_packing_stream_dataset(self):
# pylint: disable=duplicate-code
dataset = load_dataset(
- "c4",
+ "allenai/c4",
"en",
streaming=True,
)["train"]
@@ -33,7 +33,7 @@ class TestPretrainingPacking(unittest.TestCase):
{
"pretraining_dataset": [
{
- "path": "c4",
+ "path": "allenai/c4",
"name": "en",
"type": "pretrain",
}