From 9e64f42e0fe2f3a5075cf516c8ea0d95837e1ff5 Mon Sep 17 00:00:00 2001
From: NanoCode012 <kevinvong@rocketmail.com>
Date: Thu, 6 Jul 2023 23:08:09 +0900
Subject: [PATCH] Fix local path loading and custom strategy type

---
 README.md | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index e45ac54b7..88e8b28ca 100644
--- a/README.md
+++ b/README.md
@@ -237,7 +237,7 @@ Have dataset(s) in one of the following format (JSONL recommended):
 #### How to add custom prompts
 
   1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example.
-  2. Use your custom file name as the dataset type.
+  2. Use your custom file name as the dataset type, in the form `<prompt_strategies_file>.load_<load_fn>`.
 
 Optionally, download some datasets, see [data/README.md](data/README.md)
 
@@ -255,10 +255,18 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
 
 - dataset
   ```yaml
+  sequence_len: 2048 # max token length for prompt
+  
+  # huggingface repo
   datasets:
-    - path: vicgalle/alpaca-gpt4 # local or huggingface repo
+    - path: vicgalle/alpaca-gpt4
+      type: alpaca # format from earlier
+
+  # local file (alternative to the above; keep only one `datasets:` key per config)
+  datasets:
+    - path: json
+      data_files: data.jsonl # or a .json file
       type: alpaca # format from earlier
-  sequence_len: 2048 # max token length / prompt
   ```
 
 - loading
@@ -328,10 +336,10 @@ tf32: true # require >=ampere
 
 # a list of one or more datasets to finetune the model with
 datasets:
-  # this can be either a hf dataset, or relative path
+  # hf dataset repo, or "json" for a local dataset (in which case data_files must be set)
   - path: vicgalle/alpaca-gpt4
   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-    type: alpaca # format OR format:prompt_style (chat/instruct)
+    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies_file>.load_<load_fn>
     data_files: # path to source data files
     shards: # number of shards to split data into