update table for rwkv4 support, fix process count for dataset (#822)

This commit is contained in:
Wing Lian
2023-11-04 23:45:44 -04:00
committed by GitHub
parent 6459ac7357
commit cdc71f73c8
3 changed files with 39 additions and 12 deletions

View File

@@ -2,7 +2,7 @@
import logging
import os
from typing import List
from typing import List, Optional
import torch
from datasets import Dataset, IterableDataset
@@ -30,14 +30,20 @@ class TokenizedPromptDataset(Dataset):
self,
prompt_tokenizer: PromptTokenizingStrategy,
dataset: IterableDataset,
process_count: Optional[int] = None,
**kwargs,
):
self.prompt_tokenizer = prompt_tokenizer
self.process_count = process_count
super().__init__(self.process(dataset).data, **kwargs)
def process(self, dataset):
features = dataset.features.keys()
num_proc = min(64, os.cpu_count())
num_proc = (
min(64, self.process_count)
if self.process_count
else min(64, os.cpu_count())
)
map_kwargs = {}
if self.prompt_tokenizer.supports_batched:
map_kwargs["batched"] = True