more fixes for dataloader integration

2023-07-18 10:50:40 -04:00
parent 762f1b08db
commit 41d4992029
2 changed files with 14 additions and 14 deletions
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -231,7 +231,7 @@ def train(
                cfg.pretraining_dataset,
                tokenizer,
                max_tokens=cfg.sequence_len,
-                seed=cfg.seed,
+                seed=cfg.seed or 42,
            )
            # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
            train_dataset = train_dataset.with_format("torch")