Fix(doc): clarify data loading for local datasets and splitting samples (#2726) [skip ci]

* fix(doc): remove incorrect json dataset loading method

* fix(doc): clarify splitting only happens in completion mode

* fix: update local file loading on config doc

* fix: typo
This commit is contained in:
NanoCode012
2025-05-28 15:48:22 +07:00
committed by GitHub
parent 4a8af60d34
commit 3e6948be97
3 changed files with 14 additions and 21 deletions

View File

@@ -98,8 +98,10 @@ plugins:
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
# A list of one or more datasets to finetune the model with
# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
datasets:
# HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
# HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
@@ -221,7 +223,7 @@ datasets:
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true
Deduplicates datasets and test_datasets with identical entries.
# Deduplicates datasets and test_datasets with identical entries.
dataset_exact_deduplication: true
# A list of one or more datasets to eval the model with.