From f0072f3b9d580e8e967e8189f8a270494ace7f85 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 11 Mar 2025 12:02:58 -0400 Subject: [PATCH] use max of 32 dataset processes if not explicit (#2403) * use max of 32 dataset processes if not explicit * change alternate min val for consistency --- src/axolotl/core/datasets/chat.py | 2 +- src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/axolotl/core/datasets/chat.py b/src/axolotl/core/datasets/chat.py index e74c247d2..ba257071d 100644 --- a/src/axolotl/core/datasets/chat.py +++ b/src/axolotl/core/datasets/chat.py @@ -43,7 +43,7 @@ class TokenizedChatDataset(Dataset): process_or_cpu_count: int = ( process_count or os.cpu_count() # type: ignore[assignment] ) - num_proc = min(64, process_or_cpu_count) + num_proc = min(32, process_or_cpu_count) features = data.features.keys() tokenized_data = data.map( map_fn, diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index 5cc3fbc4f..921a015d3 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -728,7 +728,7 @@ class AxolotlInputConfig( default=None, json_schema_extra={"description": "streaming dataset to use for pretraining"}, ) - dataset_processes: Optional[int] = Field(default=os.cpu_count()) + dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count())) # type: ignore[type-var] dataset_exact_deduplication: Optional[bool] = None dataset_keep_in_memory: Optional[bool] = None dataloader_pin_memory: Optional[bool] = None