From 24aa6b15a032714a308a28d3772be44ee0117e88 Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Mon, 14 Oct 2024 12:21:58 +0700
Subject: [PATCH] feat: handle sharegpt deprecation better in docs

---
 README.md                                 |  2 +-
 docs/dataset-formats/conversation.qmd     | 51 ++++++++++++++++++++---
 src/axolotl/prompt_strategies/sharegpt.py |  2 +-
 3 files changed, 47 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 4ce7a351b..21b954a56 100644
--- a/README.md
+++ b/README.md
@@ -383,7 +383,7 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
   - typescript
     type: ... # unimplemented custom format

-  # fastchat conversation (deprecation soon, use chat_template)
+  # fastchat conversation (soon to be deprecated, use chat_template instead: https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template)
   # See 'conversation' options: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
   - path: ...
     type: sharegpt
diff --git a/docs/dataset-formats/conversation.qmd b/docs/dataset-formats/conversation.qmd
index 6dfafa2c2..1e2454efe 100644
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -6,6 +6,8 @@ order: 3

 ## sharegpt

+UPDATE: ShareGPT is being deprecated in the next release. Please see the `chat_template` section below.
+
 conversations where `from` is `human`/`gpt`. (optional: first row with role `system` to override default system prompt)

 ```{.json filename="data.jsonl"}
@@ -81,6 +83,42 @@ Chat Template strategy uses a jinja2 template that converts a list of messages i

 See `config.qmd` for full configs and supported templates.

+### Migrating from sharegpt
+
+Most configs can be adapted as follows:
+
+```yaml
+# old
+chat_template: chatml
+datasets:
+  - path: ...
+    type: sharegpt
+    conversation: chatml
+
+# new (if using the tokenizer's chat_template)
+# chat_template: tokenizer_default # this is the default value
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+
+# new (if setting a new chat_template like chatml, gemma, etc.)
+# this overrides the chat template in the tokenizer_config.json
+chat_template: chatml
+datasets:
+  - path: ...
+    type: chat_template
+
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+```
+
+We recommend checking the examples below for other use cases.
+
 ### Examples

 1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
@@ -89,28 +127,28 @@
 datasets:
   - path: ...
     type: chat_template
-    chat_template: tokenizer_default
+    # chat_template: tokenizer_default # This is the default value
 ```

-2. Using the `gemma` chat template in the tokenizer_config.json on OpenAI messages format, training on all assistant messages.
+2. Using the `gemma` chat template to override the tokenizer_config.json's chat template, on OpenAI messages format, training on all assistant messages.

 ```yaml
-chat_template: gemma
+chat_template: gemma # this overrides the tokenizer's chat_template
 datasets:
   - path: ...
     type: chat_template
-    chat_template: gemma
     roles_to_train: ["assistant"]
 ```

 3. Using a custom jinja template on OpenAI messages format, training on all assistant messages.

 ```yaml
+# chat_template: jinja # `jinja` is implied when `chat_template_jinja` is set and this field is left empty
+chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
+
 datasets:
   - path: ...
     type: chat_template
-    chat_template: jinja
-    chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
     roles_to_train: ["assistant"]
 ```

@@ -149,6 +187,7 @@ The configuration would look like:
 ```yaml
 datasets:
   - path: ...
+    type: chat_template
     chat_template: tokenizer_default
     field_messages: conversations
     message_field_role: from
diff --git a/src/axolotl/prompt_strategies/sharegpt.py b/src/axolotl/prompt_strategies/sharegpt.py
index 4565c35d5..069d243f5 100644
--- a/src/axolotl/prompt_strategies/sharegpt.py
+++ b/src/axolotl/prompt_strategies/sharegpt.py
@@ -62,7 +62,7 @@ def build_loader(
 ):
     def _load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
         LOG.warning(
-            "sharegpt type support will be deprecated in the next release of Axolotl. Please use chat_template instead.",
+            "sharegpt type support will be deprecated in the next release of Axolotl. Please use chat_template instead: https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html#chat_template",
         )
         conversation = (
             ds_cfg["conversation"]
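
For reference, the `field_messages`/`message_field_role`/`message_field_content` mapping introduced in the docs above amounts to a key rename on each dataset row. The sketch below is illustrative only and is not part of this patch or of Axolotl's API; `ROLE_MAP` and `sharegpt_row_to_messages` are hypothetical names, and it assumes a sharegpt-style row whose `conversations` list holds `from`/`value` turns.

```python
# Illustrative sketch (hypothetical helper, not an Axolotl API): turn one
# sharegpt-style row into OpenAI-style messages, mirroring the field mapping
# used with `type: chat_template` above.
ROLE_MAP = {"system": "system", "human": "user", "gpt": "assistant"}


def sharegpt_row_to_messages(row: dict) -> list:
    """Map a `conversations` list of `from`/`value` turns to `role`/`content` messages."""
    return [
        {"role": ROLE_MAP[turn["from"]], "content": turn["value"]}
        for turn in row["conversations"]
    ]


# Example usage:
row = {"conversations": [{"from": "human", "value": "hello"}, {"from": "gpt", "value": "hello"}]}
print(sharegpt_row_to_messages(row))
# -> [{'role': 'user', 'content': 'hello'}, {'role': 'assistant', 'content': 'hello'}]
```

In practice the config-level field mapping makes this manual conversion unnecessary; it is shown only to make the correspondence between the two formats concrete.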