feat(dataset): add config to keep processed dataset in memory (#1152)

2024-01-20 13:19:28 +09:00
parent cbecf3e62a
commit 3db5f2fd17
3 changed files with 25 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -618,6 +618,9 @@ push_dataset_to_hub: # repo path
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
+# Keep the processed dataset in memory while preprocessing instead of writing it to the on-disk cache.
+# Only needed if the cached dataset is taking up too much disk storage.
+dataset_keep_in_memory:
 # push checkpoints to hub
 hub_model_id: # repo path to push finetuned model
 # how to push checkpoints to hub