fixes to accelerator so that iterable pretraining datasets work (#1759)
* fixes to accelerator so that iterable pretraining datasets work
* fix the pretraining test params
* split batches, not dispatch batches needs to be set
* update c4 datasets
* set epochs in pretrain config test
* need to set both split_batches and dispatch_batches to false for pretraining
* fix bool val in comment
This commit is contained in:
@@ -24,7 +24,7 @@ class TestPretrainingPacking(unittest.TestCase):
|
||||
def test_packing_stream_dataset(self):
|
||||
# pylint: disable=duplicate-code
|
||||
dataset = load_dataset(
|
||||
-"c4",
|
||||
+"allenai/c4",
|
||||
"en",
|
||||
streaming=True,
|
||||
)["train"]
|
||||
@@ -33,7 +33,7 @@ class TestPretrainingPacking(unittest.TestCase):
|
||||
{
|
||||
"pretraining_dataset": [
|
||||
{
|
||||
-"path": "c4",
|
||||
+"path": "allenai/c4",
|
||||
"name": "en",
|
||||
"type": "pretrain",
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user