From 9e64f42e0fe2f3a5075cf516c8ea0d95837e1ff5 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 6 Jul 2023 23:08:09 +0900 Subject: [PATCH] Fix local path loading and custom strategy type --- README.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index e45ac54b7..88e8b28ca 100644 --- a/README.md +++ b/README.md @@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended): #### How to add custom prompts 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. - 2. Use your custom file name as the dataset type. + 2. Use your custom file name as the dataset type `<prompt_strategies_file>.load_<load_fn>`. Optionally, download some datasets, see [data/README.md](data/README.md) @@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic - dataset ```yaml + sequence_len: 2048 # max token length for prompt + + # huggingface repo datasets: - - path: vicgalle/alpaca-gpt4 # local or huggingface repo + - path: vicgalle/alpaca-gpt4 + type: alpaca # format from earlier + + # local + datasets: + - path: json + data_files: data.jsonl # or json type: alpaca # format from earlier - sequence_len: 2048 # max token length / prompt ``` - loading @@ -328,10 +336,10 @@ tf32: true # require >=ampere # a list of one or more datasets to finetune the model with datasets: - # this can be either a hf dataset, or relative path + # hf dataset repo | "json" for local dataset, make sure to fill data_files - path: vicgalle/alpaca-gpt4 # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] - type: alpaca # format OR format:prompt_style (chat/instruct) + type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies_file>.load_<load_fn> data_files: # path to source data files shards: # number of shards to split data into