def chat_templates(user_choice: str) -> str:
    """
    Finds the correct chat_template for the tokenizer_config.

    These Jinja2 templates are what `tokenizer.apply_chat_template` consumes;
    the chosen string is saved onto the tokenizer for easier inference.

    Args:
        user_choice (str): The user's choice of template.
            Currently supported: "chatml" and "inst" (mistral/mixtral).

    Returns:
        str: The chosen template string.

    Raises:
        ValueError: If the user_choice is not found in the templates.
    """

    templates = {
        # Wraps user turns in [INST] ... [/INST] and appends eos_token after
        # assistant turns. Used by Mistral/Mixtral instruct models.
        "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
        # ChatML: each turn rendered as <|im_start|>role\ncontent<|im_end|>\n,
        # with an optional trailing assistant header for generation prompts.
        "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
    }

    try:
        # EAFP: a single dict lookup instead of a membership test plus a
        # second lookup.
        return templates[user_choice]
    except KeyError as exc:
        # Chain the KeyError so the original lookup failure stays visible
        # in the traceback.
        raise ValueError(f"Template '{user_choice}' not found.") from exc