use max of 32 dataset processes if not explicit (#2403)
* use max of 32 dataset processes if not explicit * change alternate min val for consistency
This commit is contained in:
@@ -43,7 +43,7 @@ class TokenizedChatDataset(Dataset):
|
||||
process_or_cpu_count: int = (
|
||||
process_count or os.cpu_count() # type: ignore[assignment]
|
||||
)
|
||||
num_proc = min(64, process_or_cpu_count)
|
||||
num_proc = min(32, process_or_cpu_count)
|
||||
features = data.features.keys()
|
||||
tokenized_data = data.map(
|
||||
map_fn,
|
||||
|
||||
@@ -728,7 +728,7 @@ class AxolotlInputConfig(
|
||||
default=None,
|
||||
json_schema_extra={"description": "streaming dataset to use for pretraining"},
|
||||
)
|
||||
dataset_processes: Optional[int] = Field(default=os.cpu_count())
|
||||
dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count())) # type: ignore[type-var]
|
||||
dataset_exact_deduplication: Optional[bool] = None
|
||||
dataset_keep_in_memory: Optional[bool] = None
|
||||
dataloader_pin_memory: Optional[bool] = None
|
||||
|
||||
Reference in New Issue
Block a user