From 6b3cdfdb8e26b2036cd54d0f457817be80e55c8a Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 10 Oct 2024 17:57:11 +0700 Subject: [PATCH] feat(doc): updated config with chat template options and clarified examples --- docs/config.qmd | 43 ++++++++++++- docs/dataset-formats/conversation.qmd | 93 +++++---------------------- 2 files changed, 59 insertions(+), 77 deletions(-) diff --git a/docs/config.qmd b/docs/config.qmd index 0e536d858..d8f925dbe 100644 --- a/docs/config.qmd +++ b/docs/config.qmd @@ -83,7 +83,7 @@ lora_on_cpu: true datasets: # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files - path: vicgalle/alpaca-gpt4 - # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] + # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection] type: alpaca # format | format: (chat/instruct) | .load_ ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file data_files: # Optional[str] path to source data files @@ -123,6 +123,47 @@ datasets: # For `completion` datsets only, uses the provided field instead of `text` column field: + # Using chat template + - path: ... + # Set type to `chat_template` to use this strategy + type: chat_template + # Specify the name of the chat template to use + # The name of the chat template to use for training, following values are supported: + # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default. + # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py + # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. 
tokenizer_default_fallback_chatml. + # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. + chat_template: tokenizer_default + # Custom jinja template for chat template. This will only be used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`). + chat_template_jinja: + # The key in the data example that contains the messages. Default is "messages". + field_messages: messages + # The key in the message turn that contains the role. Default is "role". + message_field_role: role + # The key in the message turn that contains the content. Default is "content". + message_field_content: content + # Optional[Dict[str, List]]. Roles mapping for the messages. + roles: + user: ["human", "user"] + assistant: ["gpt", "assistant", "ai"] + system: ["system"] + + ## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only the last message is trained on. + + # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss. + roles_to_train: ["gpt", "assistant"] + # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are: + # - all: train on all EOS tokens + # - turn: train on the EOS token at the end of each trainable turn + # - last: train on the last EOS token in the conversation + train_on_eos: last + # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`. + message_field_training: training + # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. 
+ # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train). + message_field_training_detail: train_detail + + # If false, the datasets will not be shuffled and will keep their original order in `datasets`. # The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true. shuffle_merged_datasets: true diff --git a/docs/dataset-formats/conversation.qmd b/docs/dataset-formats/conversation.qmd index c1d0522d1..9a35d3bad 100644 --- a/docs/dataset-formats/conversation.qmd +++ b/docs/dataset-formats/conversation.qmd @@ -73,81 +73,36 @@ creates a chat where bot is asked to tell a joke, then explain why the joke is f ## chat_template -Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Usually this chat template is stored in tokenizer_config.json under the key `chat_template`. - -Conversational data would normally look like follows: +Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. It supports using the tokenizer's template, one of the supported built-in templates, or a custom jinja2 template. ```{.json filename="data.jsonl"} -{"conversations": [{"from": "...", "value": "..."}]} +{"conversations": [{"role": "...", "content": "..."}]} ``` -with roles usually being system, user, assistant, etc. -However, all fields can be customized using the following configuration: - -```yaml -datasets: - - path: ... - # Set type to `chat_template` to use this strategy - type: chat_template - # Specify the name of the chat template to use - # The name of the chat template to use for training, following values are supported: - # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. 
- # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py - # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. - # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. - chat_template: tokenizer_default - # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null. - chat_template_jinja: null - # The key in the data example that contains the messages. Default is "conversations". - field_messages: conversations - # The key in the message turn that contains the role. Default is "from". - message_field_role: from - # The key in the message turn that contains the content. Default is "value". - message_field_content: value - # Role mapping for the messages. This can be useful if you are combining data from multiple sources and the roles are different. - roles: - human: user - user: user - assistant: assistant - gpt: assistant - system: system - # Roles to train on. The tokens from these roles will be considered for the loss. Default is ["gpt", "assistant"] - roles_to_train: ["gpt", "assistant"] - # Which EOS tokens to train on in the conversation. Possible values are: - # - all: train on all EOS tokens - # - turn: train on the EOS token at the end of each trainable turn - # - last: train on the last EOS token in the conversation - # - none: do not train on EOS tokens - # Default is "turn". - train_on_eos: turn - # The key in the message turn that indicates if tokens of a turn should be considered for training. 
This is an advanced option useful to selectively train on certain turns besides the `roles_to_train`. Default is "training". - message_field_training: training - # The key in the message turn that contains the training details. This is an advanced option useful to selectively train on certain tokens in a turn. Default is "train_detail". - message_field_training_detail: train_detail -``` +See `config.qmd` for full configs and supported templates. ### Examples -1. Using the default chat template in the tokenizer_config.json on OpenAI messages format +1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only the last message. + +```yaml +datasets: + - path: ... + type: chat_template + chat_template: tokenizer_default +``` + +2. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on all assistant messages. ```yaml datasets: - path: ... type: chat_template chat_template: tokenizer_default - field_messages: messages - message_field_role: role - message_field_content: content - roles: - user: user - assistant: assistant - human: user - gpt: assistant - system: system roles_to_train: ["assistant"] ``` -2. Using a custom jinja template on OpenAI messages format +3. Using a custom jinja template on OpenAI messages format, training on all assistant messages. 
```yaml datasets: @@ -155,20 +110,10 @@ datasets: type: chat_template chat_template: jinja chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}" - field_messages: messages - message_field_role: role - message_field_content: content - roles: - user: user - assistant: assistant - human: user - gpt: assistant - system: system roles_to_train: ["assistant"] ``` -3. Using fine-grained control over tokens and turns to train in a conversation - +4. (Advanced) Using fine-grained control over tokens and turns to train in a conversation For a data sample that looks like: @@ -207,14 +152,10 @@ datasets: field_messages: conversations message_field_role: from message_field_content: value - roles: - human: human - user: human - assistant: assistant - gpt: assistant - system: system roles_to_train: [] train_on_eos: turn message_field_training: train message_field_training_detail: train_detail ``` + +Tip: It is not necessary to use both `message_field_training` and `message_field_training_detail` at the same time.