Fix(doc): clarify data loading for local datasets and splitting samples (#2726) [skip ci]

* fix(doc): remove incorrect json dataset loading method
* fix(doc): clarify splitting only happens in completion mode
* fix: update local file loading on config doc
* fix: typo
2025-05-28 15:48:22 +07:00
parent 4a8af60d34
commit 3e6948be97
3 changed files with 14 additions and 21 deletions
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -98,8 +98,10 @@ plugins:
  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

 # A list of one or more datasets to finetune the model with
+# See https://docs.axolotl.ai/docs/dataset_loading.html for a guide on loading datasets
+# See https://docs.axolotl.ai/docs/dataset-formats/ for a guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -221,7 +223,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true

-Deduplicates datasets and test_datasets with identical entries.
+# Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true

 # A list of one or more datasets to eval the model with.