From 530a0c0bf0759d59e87684349f11f042657397fa Mon Sep 17 00:00:00 2001 From: tgoab Date: Tue, 10 Feb 2026 05:44:17 -0500 Subject: [PATCH] Changes from dataset_processes to dataset_num_proc (#3352) [skip ci] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * changes from dataset_processes to dataset_num_proc * deprecation message improved --------- Co-authored-by: Juliana Nieto Cárdenas --- .runpod/README.md | 2 +- .runpod/src/config/config.yaml | 7 ++----- src/axolotl/utils/datasets.py | 6 ++++++ tests/core/test_builders.py | 2 +- tests/e2e/test_streaming.py | 2 +- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.runpod/README.md b/.runpod/README.md index 8042f4f91..2cebaa5e7 100644 --- a/.runpod/README.md +++ b/.runpod/README.md @@ -123,7 +123,7 @@ datasets: | --------------------------------- | -------------------------- | ----------------------------------- | | `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | | `push_dataset_to_hub` | `""` | Push dataset to HF hub | -| `dataset_processes` | `4` | Number of preprocessing processes | +| `dataset_num_proc` | `4` | Number of preprocessing processes | | `dataset_keep_in_memory` | `false` | Keep dataset in memory | | `shuffle_merged_datasets` | `true` | Shuffle merged datasets | | `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging | diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml index fde3730b2..b43c83dfe 100644 --- a/.runpod/src/config/config.yaml +++ b/.runpod/src/config/config.yaml @@ -39,7 +39,6 @@ # type: # linear | dynamic # factor: # float - # # Whether you are training a 4-bit GPTQ quantized model # gptq: true # gptq_groupsize: 128 # group size @@ -107,7 +106,7 @@ # push_dataset_to_hub: # repo path # # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` # # if not set. -# dataset_processes: # defaults to os.cpu_count() if not set +# dataset_num_proc: # defaults to os.cpu_count() if not set # # push checkpoints to hub # hub_model_id: # repo path to push finetuned model # # how to push checkpoints to hub @@ -349,8 +348,6 @@ # # Allow overwrite yml config using from cli # strict: - - base_model: ${BASE_MODEL} base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS} base_model_config: ${BASE_MODEL_CONFIG} @@ -409,7 +406,7 @@ chat_template_jinja: ${CHAT_TEMPLATE_JINJA} default_system_message: ${DEFAULT_SYSTEM_MESSAGE} dataset_prepared_path: ${DATASET_PREPARED_PATH} push_dataset_to_hub: ${PUSH_DATASET_TO_HUB} -dataset_processes: ${DATASET_PROCESSES} +dataset_num_proc: ${DATASET_NUM_PROC} dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY} hub_model_id: ${HUB_MODEL_ID} hub_strategy: ${HUB_STRATEGY} diff --git a/src/axolotl/utils/datasets.py b/src/axolotl/utils/datasets.py index 9b8a8e25a..19ad71640 100644 --- a/src/axolotl/utils/datasets.py +++ b/src/axolotl/utils/datasets.py @@ -1,12 +1,18 @@ """helper functions for datasets""" import os +from axolotl.utils.logging import get_logger +LOG = get_logger(__name__) def get_default_process_count(): if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"): return int(axolotl_dataset_num_proc) if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"): + LOG.warning( + "AXOLOTL_DATASET_PROCESSES and `dataset_processes` are deprecated and will be " + "removed in a future version. Please use `dataset_num_proc` instead." + ) return int(axolotl_dataset_processes) if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"): return int(runpod_cpu_count) diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py index 5f1481101..194950e15 100644 --- a/tests/core/test_builders.py +++ b/tests/core/test_builders.py @@ -79,7 +79,7 @@ def fixture_base_cfg(): "ddp_timeout": 1800, "ddp_bucket_cap_mb": 25, "ddp_broadcast_buffers": False, - "dataset_processes": 4, + "dataset_num_proc": 4, } ) diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py index 125eb43eb..404fb53da 100644 --- a/tests/e2e/test_streaming.py +++ b/tests/e2e/test_streaming.py @@ -30,7 +30,7 @@ class TestStreamingDatasets: "sample_packing": sample_packing, "pretrain_multipack_attn": sample_packing, "streaming_multipack_buffer_size": 10000, - "dataset_processes": 1, + "dataset_num_proc": 1, "special_tokens": { "pad_token": "<|endoftext|>", },