diff --git a/docs/config.qmd b/docs/config.qmd
index ac4c3fa4f..eab8d28ca 100644
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -98,8 +98,10 @@ plugins:
   # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 
 # A list of one or more datasets to finetune the model with
+# See https://docs.axolotl.ai/docs/dataset_loading.html for a guide on loading datasets
+# See https://docs.axolotl.ai/docs/dataset-formats/ for a guide on dataset formats
 datasets:
-  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+  # HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
   - path: vicgalle/alpaca-gpt4
     # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_style>.load_<load_fn>
@@ -221,7 +223,7 @@ datasets:
 # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
 shuffle_merged_datasets: true
 
-Deduplicates datasets and test_datasets with identical entries.
+# Deduplicates datasets and test_datasets with identical entries.
 dataset_exact_deduplication: true
 
 # A list of one or more datasets to eval the model with.
diff --git a/docs/dataset-formats/index.qmd b/docs/dataset-formats/index.qmd
index 9898bbc9b..a0113db07 100644
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -36,10 +36,6 @@ It is typically recommended to save your dataset as `.jsonl` due to its flexibil
 
 Axolotl supports loading from a Hugging Face hub repo or from local files.
 
-::: {.callout-important}
-For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.
-:::
-
 ### Pre-training from Hugging Face hub datasets
 
 As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:
@@ -77,18 +73,21 @@ datasets:
     type: completion
 ```
 
-From local files (either example works):
+From local files:
 
 ```yaml
 datasets:
   - path: A.jsonl
     type: completion
-  - path: json
-    data_files: ["A.jsonl", "B.jsonl", "C.jsonl"]
+  - path: B.jsonl
     type: completion
 ```
 
 
+::: {.callout-important}
+For `completion` datasets only, Axolotl will split texts that exceed the context length into multiple smaller prompts. If you would like this for `pretraining_dataset` too, please let us know or help make a PR!
+:::
+
 ### Pre-training dataset configuration tips
 
 #### Setting max_steps
diff --git a/docs/dataset_loading.qmd b/docs/dataset_loading.qmd
index 09c8b0098..b78f86a98 100644
--- a/docs/dataset_loading.qmd
+++ b/docs/dataset_loading.qmd
@@ -54,7 +54,7 @@ datasets:
 
 #### Files
 
-Usually, to load a JSON file, you would do something like this:
+To load a JSON file, you would do something like this:
 
 ```python
 from datasets import load_dataset
@@ -66,20 +66,12 @@ Which translates to the following config:
 
 ```yaml
 datasets:
-  - path: json
-    data_files: /path/to/your/file.jsonl
-```
-
-However, to make things easier, we have added a few shortcuts for loading local dataset files.
-
-You can just point the `path` to the file or directory along with the `ds_type` to load the dataset. The below example shows for a JSON file:
-
-```yaml
-datasets:
-  - path: /path/to/your/file.jsonl
+  - path: data.json
     ds_type: json
 ```
 
+As shown above, you can simply point `path` at the file or directory and set `ds_type` to load the dataset.
+
 This works for CSV, JSON, Parquet, and Arrow files.
 
 ::: {.callout-tip}
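
Taken together, the documentation changes above describe pointing each entry's `path` either at a Hugging Face Hub repo or directly at a local file, with `ds_type` selecting the loader. Below is a minimal sketch of a config combining only options that appear in the diff; the local file name `data.jsonl` is a hypothetical placeholder, not something from the patch:

```yaml
datasets:
  # Hugging Face Hub repo, as in the existing config.qmd example
  - path: vicgalle/alpaca-gpt4
    type: alpaca
  # Local file loaded via the path + ds_type shortcut described above
  # (data.jsonl is a hypothetical file name)
  - path: data.jsonl
    ds_type: json
    type: alpaca

# Dataset-level options documented in the config.qmd hunk above
shuffle_merged_datasets: true
dataset_exact_deduplication: true
```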