Changes from dataset_processes to dataset_num_proc (#3352) [skip ci]
* changes from dataset_processes to dataset_num_proc
* deprecation message improved

---------

Co-authored-by: Juliana Nieto Cárdenas <jnietoca@purdue.edu>
This commit is contained in:
@@ -123,7 +123,7 @@ datasets:
|
|||||||
| --------------------------------- | -------------------------- | ----------------------------------- |
|
| --------------------------------- | -------------------------- | ----------------------------------- |
|
||||||
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
|
||||||
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
|
||||||
| `dataset_processes` | `4` | Number of preprocessing processes |
|
| `dataset_num_proc` | `4` | Number of preprocessing processes |
|
||||||
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
|
||||||
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
|
||||||
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
|
| `shuffle_before_merging_datasets` | `false` | Shuffle each dataset before merging |
|
||||||
|
|||||||
@@ -39,7 +39,6 @@
|
|||||||
# type: # linear | dynamic
|
# type: # linear | dynamic
|
||||||
# factor: # float
|
# factor: # float
|
||||||
|
|
||||||
|
|
||||||
# # Whether you are training a 4-bit GPTQ quantized model
|
# # Whether you are training a 4-bit GPTQ quantized model
|
||||||
# gptq: true
|
# gptq: true
|
||||||
# gptq_groupsize: 128 # group size
|
# gptq_groupsize: 128 # group size
|
||||||
@@ -107,7 +106,7 @@
|
|||||||
# push_dataset_to_hub: # repo path
|
# push_dataset_to_hub: # repo path
|
||||||
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
|
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
|
||||||
# # if not set.
|
# # if not set.
|
||||||
# dataset_processes: # defaults to os.cpu_count() if not set
|
# dataset_num_proc: # defaults to os.cpu_count() if not set
|
||||||
# # push checkpoints to hub
|
# # push checkpoints to hub
|
||||||
# hub_model_id: # repo path to push finetuned model
|
# hub_model_id: # repo path to push finetuned model
|
||||||
# # how to push checkpoints to hub
|
# # how to push checkpoints to hub
|
||||||
@@ -349,8 +348,6 @@
|
|||||||
# # Allow overwrite yml config using from cli
|
# # Allow overwrite yml config using from cli
|
||||||
# strict:
|
# strict:
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
base_model: ${BASE_MODEL}
|
base_model: ${BASE_MODEL}
|
||||||
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
|
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
|
||||||
base_model_config: ${BASE_MODEL_CONFIG}
|
base_model_config: ${BASE_MODEL_CONFIG}
|
||||||
@@ -409,7 +406,7 @@ chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
|
|||||||
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
|
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
|
||||||
dataset_prepared_path: ${DATASET_PREPARED_PATH}
|
dataset_prepared_path: ${DATASET_PREPARED_PATH}
|
||||||
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
|
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
|
||||||
dataset_processes: ${DATASET_PROCESSES}
|
dataset_num_proc: ${DATASET_NUM_PROC}
|
||||||
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
|
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
|
||||||
hub_model_id: ${HUB_MODEL_ID}
|
hub_model_id: ${HUB_MODEL_ID}
|
||||||
hub_strategy: ${HUB_STRATEGY}
|
hub_strategy: ${HUB_STRATEGY}
|
||||||
|
|||||||
@@ -1,12 +1,18 @@
|
|||||||
"""helper functions for datasets"""
|
"""helper functions for datasets"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
def get_default_process_count():
|
def get_default_process_count():
|
||||||
if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
|
if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
|
||||||
return int(axolotl_dataset_num_proc)
|
return int(axolotl_dataset_num_proc)
|
||||||
if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"):
|
if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"):
|
||||||
|
LOG.warning(
|
||||||
|
"AXOLOTL_DATASET_PROCESSES and `dataset_processes` are deprecated and will be "
|
||||||
|
"removed in a future version. Please use `dataset_num_proc` instead."
|
||||||
|
)
|
||||||
return int(axolotl_dataset_processes)
|
return int(axolotl_dataset_processes)
|
||||||
if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"):
|
if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"):
|
||||||
return int(runpod_cpu_count)
|
return int(runpod_cpu_count)
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ def fixture_base_cfg():
|
|||||||
"ddp_timeout": 1800,
|
"ddp_timeout": 1800,
|
||||||
"ddp_bucket_cap_mb": 25,
|
"ddp_bucket_cap_mb": 25,
|
||||||
"ddp_broadcast_buffers": False,
|
"ddp_broadcast_buffers": False,
|
||||||
"dataset_processes": 4,
|
"dataset_num_proc": 4,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -30,7 +30,7 @@ class TestStreamingDatasets:
|
|||||||
"sample_packing": sample_packing,
|
"sample_packing": sample_packing,
|
||||||
"pretrain_multipack_attn": sample_packing,
|
"pretrain_multipack_attn": sample_packing,
|
||||||
"streaming_multipack_buffer_size": 10000,
|
"streaming_multipack_buffer_size": 10000,
|
||||||
"dataset_processes": 1,
|
"dataset_num_proc": 1,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>",
|
"pad_token": "<|endoftext|>",
|
||||||
},
|
},
|
||||||
|
|||||||
Reference in New Issue
Block a user