From 8626b54aab930bafaa2a48d78ab0e40076358850 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Sat, 27 May 2023 23:51:17 +0900 Subject: [PATCH] Add `dataset_shard_num` and `dataset_shard_idx` --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 6064d1b21..c8874bd7f 100644 --- a/README.md +++ b/README.md @@ -209,6 +209,10 @@ dataset_prepared_path: data/last_run_prepared push_dataset_to_hub: # repo path # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc val_set_size: 0.04 +# Number of shards for the whole dataset +dataset_shard_num: +# Index of the shard to use for the whole dataset +dataset_shard_idx: # the maximum length of an input to train with, this should typically be less than 2048 # as most models have a token/context limit of 2048