From 2e57391bf81a2cfcad9eebe22afb38990a13881f Mon Sep 17 00:00:00 2001
From: NanoCode012 <nano@axolotl.ai>
Date: Fri, 14 Feb 2025 05:28:21 +0700
Subject: [PATCH] fix: add missing shards_idx, preprocess_shards to docs and
 validator (#2331)

---
 docs/config.qmd                                          | 7 ++++++-
 src/axolotl/utils/config/models/input/v0_4_1/__init__.py | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/docs/config.qmd b/docs/config.qmd
index 327f4ae6f..5221cbe7d 100644
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -91,7 +91,12 @@ datasets:
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
     data_files: # Optional[str] path to source data files
-    shards: # Optional[int] number of shards to split data into
+
+    shards: # Optional[int] split dataset into N pieces (use with shards_idx)
+    shards_idx: # Optional[int] = 0 index of the shard to use (use with shards)
+
+    preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (mutually exclusive with `shards`)
+
     name: # Optional[str] name of dataset configuration to load
     train_on_split: train # Optional[str] name of dataset split to load from
     revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 868328b0b..1f6fdc612 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -169,6 +169,7 @@ class SFTDataset(BaseModel):
     type: Optional[Union[str, UserDefinedPrompterType]] = None
     input_transform: Optional[str] = None
     shards: Optional[int] = None
+    shards_idx: Optional[int] = None
     preprocess_shards: Optional[int] = None
     conversation: Optional[str] = None
     # Do not make this too strict or it will break the validator to choose different dataset class