From 2e57391bf81a2cfcad9eebe22afb38990a13881f Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 14 Feb 2025 05:28:21 +0700 Subject: [PATCH] fix: add missing shards_idx, preprocess_shards to docs and validator (#2331) --- docs/config.qmd | 7 ++++++- src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/config.qmd b/docs/config.qmd index 327f4ae6f..5221cbe7d 100644 --- a/docs/config.qmd +++ b/docs/config.qmd @@ -91,7 +91,12 @@ datasets: type: alpaca # format | format: (chat/instruct) | .load_ ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file data_files: # Optional[str] path to source data files - shards: # Optional[int] number of shards to split data into + + shards: # Optional[int] split dataset into N pieces (use with shards_idx) + shards_idx: # Optional[int] = 0 the zero-based index of the shard to use + + preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`) + name: # Optional[str] name of dataset configuration to load train_on_split: train # Optional[str] name of dataset split to load from revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets. 
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py index 868328b0b..1f6fdc612 100644 --- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py +++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py @@ -169,6 +169,7 @@ class SFTDataset(BaseModel): type: Optional[Union[str, UserDefinedPrompterType]] = None input_transform: Optional[str] = None shards: Optional[int] = None + shards_idx: Optional[int] = None preprocess_shards: Optional[int] = None conversation: Optional[str] = None # Do not make this too strict or it will break the validator to choose different dataset class