Preprocess dataset size fix (#1131)
* overwrite cache on preprocess step
* don't cache the TokenizedPromptDataset at all
* `load_from_cache_file` is no longer needed
This commit is contained in:
@@ -35,7 +35,10 @@ class TokenizedPromptDataset(Dataset):
|
||||
):
|
||||
self.prompt_tokenizer = prompt_tokenizer
|
||||
self.process_count = process_count
|
||||
super().__init__(self.process(dataset).data, **kwargs)
|
||||
super().__init__(
|
||||
self.process(dataset).data,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def process(self, dataset):
|
||||
features = dataset.features.keys()
|
||||
@@ -52,6 +55,7 @@ class TokenizedPromptDataset(Dataset):
|
||||
self.prompt_tokenizer.tokenize_prompt,
|
||||
num_proc=num_proc,
|
||||
remove_columns=features,
|
||||
keep_in_memory=True,
|
||||
**map_kwargs,
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user