1196 lines
82 KiB
HTML
1196 lines
82 KiB
HTML
<!DOCTYPE html>
|
||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||
|
||
<meta charset="utf-8">
|
||
<meta name="generator" content="quarto-1.6.40">
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||
|
||
<meta name="description" content="Guide to Dataset Formats in Axolotl">
|
||
|
||
<title>Dataset Formats – Axolotl</title>
|
||
<style>
|
||
code{white-space: pre-wrap;}
|
||
span.smallcaps{font-variant: small-caps;}
|
||
div.columns{display: flex; gap: min(4vw, 1.5em);}
|
||
div.column{flex: auto; overflow-x: auto;}
|
||
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
|
||
ul.task-list{list-style: none;}
|
||
ul.task-list li input[type="checkbox"] {
|
||
width: 0.8em;
|
||
margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */
|
||
vertical-align: middle;
|
||
}
|
||
/* CSS for syntax highlighting */
|
||
pre > code.sourceCode { white-space: pre; position: relative; }
|
||
pre > code.sourceCode > span { line-height: 1.25; }
|
||
pre > code.sourceCode > span:empty { height: 1.2em; }
|
||
.sourceCode { overflow: visible; }
|
||
code.sourceCode > span { color: inherit; text-decoration: inherit; }
|
||
div.sourceCode { margin: 1em 0; }
|
||
pre.sourceCode { margin: 0; }
|
||
@media screen {
|
||
div.sourceCode { overflow: auto; }
|
||
}
|
||
@media print {
|
||
pre > code.sourceCode { white-space: pre-wrap; }
|
||
pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; }
|
||
}
|
||
pre.numberSource code
|
||
{ counter-reset: source-line 0; }
|
||
pre.numberSource code > span
|
||
{ position: relative; left: -4em; counter-increment: source-line; }
|
||
pre.numberSource code > span > a:first-child::before
|
||
{ content: counter(source-line);
|
||
position: relative; left: -1em; text-align: right; vertical-align: baseline;
|
||
border: none; display: inline-block;
|
||
-webkit-touch-callout: none; -webkit-user-select: none;
|
||
-khtml-user-select: none; -moz-user-select: none;
|
||
-ms-user-select: none; user-select: none;
|
||
padding: 0 4px; width: 4em;
|
||
}
|
||
pre.numberSource { margin-left: 3em; padding-left: 4px; }
|
||
div.sourceCode
|
||
{ }
|
||
@media screen {
|
||
pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
|
||
}
|
||
</style>
|
||
|
||
|
||
<script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
|
||
<script src="../../site_libs/clipboard/clipboard.min.js"></script>
|
||
<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
|
||
<script src="../../site_libs/quarto-search/fuse.min.js"></script>
|
||
<script src="../../site_libs/quarto-search/quarto-search.js"></script>
|
||
<meta name="quarto:offset" content="../../">
|
||
<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
|
||
<script src="../../site_libs/quarto-html/quarto.js"></script>
|
||
<script src="../../site_libs/quarto-html/popper.min.js"></script>
|
||
<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||
<script src="../../site_libs/quarto-html/anchor.min.js"></script>
|
||
<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-549806ee2085284f45b00abea8c6df48.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||
<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
|
||
<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||
<link href="../../site_libs/bootstrap/bootstrap-141b2cdb37a94fcfd6825c1581ff795f.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="light">
|
||
<script id="quarto-search-options" type="application/json">{
|
||
"location": "navbar",
|
||
"copy-button": false,
|
||
"collapse-after": 3,
|
||
"panel-placement": "end",
|
||
"type": "overlay",
|
||
"limit": 50,
|
||
"keyboard-shortcut": [
|
||
"f",
|
||
"/",
|
||
"s"
|
||
],
|
||
"show-item-context": false,
|
||
"language": {
|
||
"search-no-results-text": "No results",
|
||
"search-matching-documents-text": "matching documents",
|
||
"search-copy-link-title": "Copy link to search",
|
||
"search-hide-matches-text": "Hide additional matches",
|
||
"search-more-match-text": "more match in this document",
|
||
"search-more-matches-text": "more matches in this document",
|
||
"search-clear-button-title": "Clear",
|
||
"search-text-placeholder": "",
|
||
"search-detached-cancel-button-title": "Cancel",
|
||
"search-submit-button-title": "Submit",
|
||
"search-label": "Search"
|
||
}
|
||
}</script>
|
||
|
||
|
||
<link rel="stylesheet" href="../../styles.css">
|
||
</head>
|
||
|
||
<body class="nav-sidebar docked nav-fixed">
|
||
|
||
<div id="quarto-search-results"></div>
|
||
<header id="quarto-header" class="headroom fixed-top">
|
||
<nav class="navbar navbar-expand " data-bs-theme="dark">
|
||
<div class="navbar-container container-fluid">
|
||
<div class="navbar-brand-container mx-auto">
|
||
<a class="navbar-brand" href="../../index.html">
|
||
<span class="navbar-title">Axolotl</span>
|
||
</a>
|
||
</div>
|
||
<div class="quarto-navbar-tools tools-wide tools-end">
|
||
<a href="https://twitter.com/axolotl_ai" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-twitter"></i></a>
|
||
<a href="https://github.com/axolotl-ai-cloud/axolotl/" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-github"></i></a>
|
||
<a href="https://discord.gg/7m9sfhzaf3" title="" class="quarto-navigation-tool px-1" aria-label=""><i class="bi bi-discord"></i></a>
|
||
</div>
|
||
<div id="quarto-search" class="" title="Search"></div>
|
||
</div> <!-- /container-fluid -->
|
||
</nav>
|
||
<nav class="quarto-secondary-nav">
|
||
<div class="container-fluid d-flex">
|
||
<button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||
<i class="bi bi-layout-text-sidebar-reverse"></i>
|
||
</button>
|
||
<nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"><li class="breadcrumb-item"><a href="../../docs/dataset-formats/index.html">Dataset Formats</a></li></ol></nav>
|
||
<a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
|
||
</a>
|
||
</div>
|
||
</nav>
|
||
</header>
|
||
<!-- content -->
|
||
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
|
||
<!-- sidebar -->
|
||
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
|
||
<div class="sidebar-menu-container">
|
||
<ul class="list-unstyled mt-1">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../index.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Home</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">How-To Guides</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/getting-started.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Getting Started with Axolotl</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/installation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Installation Guide</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/debugging.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Debugging</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/inference.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Inference Guide</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multipack.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multipack (Sample Packing)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FDSP + QLoRA</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/input_output.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Template-free prompt construction</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/rlhf.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">RLHF (Beta)</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/nccl.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">NCCL</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/mac.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Mac M-series</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multi-GPU Training Guide</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/multi-node.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Multi Node</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Unsloth</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Training with AMD GPUs on HPC Systems</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Ray Train integration</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link active">
|
||
<span class="menu-text">Dataset Formats</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Pre-training</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Instruction Tuning</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Conversation</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Stepwise Supervised Format</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Template-Free</span></a>
|
||
</div>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item sidebar-item-section">
|
||
<div class="sidebar-item-container">
|
||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true">
|
||
<span class="menu-text">Reference</span></a>
|
||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true" aria-label="Toggle section">
|
||
<i class="bi bi-chevron-right ms-2"></i>
|
||
</a>
|
||
</div>
|
||
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth1 show">
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/config.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">Config options</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</li>
|
||
<li class="sidebar-item">
|
||
<div class="sidebar-item-container">
|
||
<a href="../../docs/faq.html" class="sidebar-item-text sidebar-link">
|
||
<span class="menu-text">FAQ</span></a>
|
||
</div>
|
||
</li>
|
||
</ul>
|
||
</div>
|
||
</nav>
|
||
<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
|
||
<!-- margin-sidebar -->
|
||
<div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
|
||
<nav id="TOC" role="doc-toc" class="toc-active">
|
||
<h2 id="toc-title">On this page</h2>
|
||
|
||
<ul>
|
||
<li><a href="#pre-training" id="toc-pre-training" class="nav-link active" data-scroll-target="#pre-training">Pre-training</a>
|
||
<ul>
|
||
<li><a href="#pre-training-from-hugging-face-hub-datasets" id="toc-pre-training-from-hugging-face-hub-datasets" class="nav-link" data-scroll-target="#pre-training-from-hugging-face-hub-datasets">Pre-training from Hugging Face hub datasets</a></li>
|
||
<li><a href="#pre-training-from-local-dataset-files" id="toc-pre-training-from-local-dataset-files" class="nav-link" data-scroll-target="#pre-training-from-local-dataset-files">Pre-training from local dataset files</a></li>
|
||
<li><a href="#pre-training-without-streaming" id="toc-pre-training-without-streaming" class="nav-link" data-scroll-target="#pre-training-without-streaming">Pre-training without streaming</a></li>
|
||
<li><a href="#pre-training-dataset-configuration-tips" id="toc-pre-training-dataset-configuration-tips" class="nav-link" data-scroll-target="#pre-training-dataset-configuration-tips">Pre-training dataset configuration tips</a>
|
||
<ul>
|
||
<li><a href="#setting-max_steps" id="toc-setting-max_steps" class="nav-link" data-scroll-target="#setting-max_steps">Setting max_steps</a></li>
|
||
<li><a href="#group_by_length" id="toc-group_by_length" class="nav-link" data-scroll-target="#group_by_length">Group_by_length</a></li>
|
||
</ul></li>
|
||
</ul></li>
|
||
<li><a href="#supervised-fine-tuning-sft" id="toc-supervised-fine-tuning-sft" class="nav-link" data-scroll-target="#supervised-fine-tuning-sft">Supervised fine-tuning (SFT)</a>
|
||
<ul>
|
||
<li><a href="#pre-tokenized-dataset" id="toc-pre-tokenized-dataset" class="nav-link" data-scroll-target="#pre-tokenized-dataset">Pre-Tokenized Dataset</a></li>
|
||
<li><a href="#template-free-dataset" id="toc-template-free-dataset" class="nav-link" data-scroll-target="#template-free-dataset">Template Free Dataset</a></li>
|
||
<li><a href="#conversation-dataset" id="toc-conversation-dataset" class="nav-link" data-scroll-target="#conversation-dataset">Conversation Dataset</a>
|
||
<ul>
|
||
<li><a href="#what-are-chat_templates" id="toc-what-are-chat_templates" class="nav-link" data-scroll-target="#what-are-chat_templates">What are <code>chat_templates</code>?</a></li>
|
||
<li><a href="#common-conversation-dataset-formats" id="toc-common-conversation-dataset-formats" class="nav-link" data-scroll-target="#common-conversation-dataset-formats">Common Conversation Dataset formats</a></li>
|
||
<li><a href="#chat-template-usage" id="toc-chat-template-usage" class="nav-link" data-scroll-target="#chat-template-usage">Chat Template Usage</a>
|
||
<ul class="collapse">
|
||
<li><a href="#choosing-a-chat_template" id="toc-choosing-a-chat_template" class="nav-link" data-scroll-target="#choosing-a-chat_template">Choosing a <code>chat_template</code></a></li>
|
||
<li><a href="#setting-chat_template-dataset-keys" id="toc-setting-chat_template-dataset-keys" class="nav-link" data-scroll-target="#setting-chat_template-dataset-keys">Setting <code>chat_template</code> dataset keys</a></li>
|
||
<li><a href="#handling-masking" id="toc-handling-masking" class="nav-link" data-scroll-target="#handling-masking">Handling masking</a></li>
|
||
</ul></li>
|
||
<li><a href="#applying-chat_template" id="toc-applying-chat_template" class="nav-link" data-scroll-target="#applying-chat_template">Applying <code>chat_template</code></a></li>
|
||
</ul></li>
|
||
<li><a href="#instruction-dataset" id="toc-instruction-dataset" class="nav-link" data-scroll-target="#instruction-dataset">Instruction Dataset</a>
|
||
<ul>
|
||
<li><a href="#custom-instruct-prompt-format" id="toc-custom-instruct-prompt-format" class="nav-link" data-scroll-target="#custom-instruct-prompt-format">Custom Instruct Prompt Format</a></li>
|
||
</ul></li>
|
||
</ul></li>
|
||
<li><a href="#reinforcement-learning-from-human-feedback-rlhf" id="toc-reinforcement-learning-from-human-feedback-rlhf" class="nav-link" data-scroll-target="#reinforcement-learning-from-human-feedback-rlhf">Reinforcement Learning from Human Feedback (RLHF)</a></li>
|
||
</ul>
|
||
</nav>
|
||
</div>
|
||
<!-- main -->
|
||
<main class="content" id="quarto-document-content">
|
||
|
||
<header id="title-block-header" class="quarto-title-block default">
|
||
<div class="quarto-title">
|
||
<h1 class="title">Dataset Formats</h1>
|
||
</div>
|
||
|
||
<div>
|
||
<div class="description">
|
||
Guide to Dataset Formats in Axolotl
|
||
</div>
|
||
</div>
|
||
|
||
|
||
<div class="quarto-title-meta">
|
||
|
||
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</header>
|
||
|
||
|
||
<p>Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.</p>
|
||
<p>As there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.</p>
|
||
<p>Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.</p>
|
||
<section id="pre-training" class="level2">
|
||
<h2 class="anchored" data-anchor-id="pre-training"><a href="../../docs/dataset-formats/pretraining.html">Pre-training</a></h2>
|
||
<p>When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports <a href="https://huggingface.co/docs/datasets/en/stream">streaming</a> to only load batches into memory at a time.</p>
|
||
<p>A sample format for a pre-training dataset is as follows:</p>
|
||
<div class="sourceCode" id="cb1"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"text"</span><span class="fu">:</span> <span class="st">"first row"</span><span class="fu">}</span></span>
|
||
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"text"</span><span class="fu">:</span> <span class="st">"second row"</span><span class="fu">}</span></span>
|
||
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="er">...</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>It is typically recommended to save your dataset as <code>.jsonl</code> due to its flexibility and simplicity.</p>
|
||
<p>Axolotl supports loading from a Hugging Face hub repo or from local files.</p>
|
||
<div class="callout callout-style-default callout-important callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Important
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p>For pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.</p>
|
||
</div>
|
||
</div>
|
||
<section id="pre-training-from-hugging-face-hub-datasets" class="level3">
|
||
<h3 class="anchored" data-anchor-id="pre-training-from-hugging-face-hub-datasets">Pre-training from Hugging Face hub datasets</h3>
|
||
<p>As an example, to train using a Hugging Face dataset <code>hf_org/name</code>, you can pass the following config:</p>
|
||
<div class="sourceCode" id="cb2"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span><span class="at"> hf_org/name</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="pre-training-from-local-dataset-files" class="level3">
|
||
<h3 class="anchored" data-anchor-id="pre-training-from-local-dataset-files">Pre-training from local dataset files</h3>
|
||
<p>Given a few corpus files: <code>A.jsonl</code>, <code>B.jsonl</code>, and <code>C.jsonl</code>, your config will look like the below:</p>
|
||
<div class="sourceCode" id="cb3"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span></span>
|
||
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> json</span></span>
|
||
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">data_files</span><span class="kw">:</span></span>
|
||
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> B.jsonl</span></span>
|
||
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> C.jsonl</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>While we recommend <code>.jsonl</code>, you can also use the other formats (<code>csv</code>, <code>parquet</code>, <code>arrow</code>, <code>SQL</code>, <code>Webdataset</code>) that are supported by <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files"><code>Dataset.load_dataset</code></a></p>
|
||
</section>
|
||
<section id="pre-training-without-streaming" class="level3">
|
||
<h3 class="anchored" data-anchor-id="pre-training-without-streaming">Pre-training without streaming</h3>
|
||
<p>On the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the <code>completion</code> format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.</p>
|
||
<p>One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.</p>
|
||
<p>From Hugging Face:</p>
|
||
<div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> hf_org/name</span></span>
|
||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>From local files (either example works):</p>
|
||
<div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span>
|
||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> json</span></span>
|
||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">data_files</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"A.jsonl"</span><span class="kw">,</span><span class="at"> </span><span class="st">"B.jsonl"</span><span class="kw">,</span><span class="at"> </span><span class="st">"C.jsonl"</span><span class="kw">]</span></span>
|
||
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="pre-training-dataset-configuration-tips" class="level3">
|
||
<h3 class="anchored" data-anchor-id="pre-training-dataset-configuration-tips">Pre-training dataset configuration tips</h3>
|
||
<section id="setting-max_steps" class="level4">
|
||
<h4 class="anchored" data-anchor-id="setting-max_steps">Setting max_steps</h4>
|
||
<p>When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.</p>
|
||
<p>Therefore, it is necessary to set <code>max_steps: int</code> in your config for pre-training to run, so that Axolotl knows when to stop training.</p>
|
||
<p>One step is equal to <code>sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus</code> tokens.</p>
|
||
</section>
|
||
<section id="group_by_length" class="level4">
|
||
<h4 class="anchored" data-anchor-id="group_by_length">Group_by_length</h4>
|
||
<p>It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.</p>
|
||
</section>
|
||
</section>
|
||
</section>
|
||
<section id="supervised-fine-tuning-sft" class="level2">
|
||
<h2 class="anchored" data-anchor-id="supervised-fine-tuning-sft">Supervised fine-tuning (SFT)</h2>
|
||
<p>Supervised fine-tuning is the process of training models to respond to an instruction or chat input.</p>
|
||
<p>As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.</p>
|
||
<p>Axolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.</p>
|
||
<p>A flow chart is as follows:</p>
|
||
<ol type="1">
|
||
<li><p>Do you already have the dataset tokenized? If yes, check <a href="#pre-tokenized-dataset">Pre-Tokenized Dataset</a>.</p></li>
|
||
<li><p>Do you want to format the dataset yourself and manually choose each section to mask? If yes, check <a href="#template-free-dataset">Template Free Dataset</a></p></li>
|
||
<li><p>Is your dataset in a “conversation” format, containing a <code>list[messages]</code>? If yes, check <a href="#conversation-dataset">Conversation Dataset</a></p></li>
|
||
<li><p>Is your dataset in an “instruct” format, containing <code>{ instruction, response }</code>? If yes, check <a href="#instruction-dataset">Instruction Dataset</a></p></li>
|
||
</ol>
|
||
<p>If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a Github Discussion.</p>
|
||
<div class="callout callout-style-default callout-tip callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Tip
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p>You can mix and match within each approach or across approaches to train a model on a variety of datasets.</p>
|
||
</div>
|
||
</div>
|
||
<section id="pre-tokenized-dataset" class="level3">
|
||
<h3 class="anchored" data-anchor-id="pre-tokenized-dataset"><a href="../../docs/dataset-formats/tokenized.html">Pre-Tokenized Dataset</a></h3>
|
||
<p>We suggest this approach when you want to bring your own tokenized dataset.</p>
|
||
<p>Axolotl expects the dataset to have three keys: - <code>input_ids</code>: from tokenizing formatted prompt - <code>attention_mask</code>: for masking padding. If you don’t add padding, it would be equal to <code>len(input_ids) * [1]</code> - <code>labels</code>: this is the same as <code>input_ids</code>, however, if you want to mask certain tokens, you would set those indices to <code>-100</code>.</p>
|
||
<div class="callout callout-style-default callout-tip callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Tip
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p>Make sure to add BOS/EOS tokens to your prompt and mask it appropriately.</p>
|
||
</div>
|
||
</div>
|
||
<p>A config for this would look like:</p>
|
||
<div class="sourceCode" id="cb6"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<div class="callout callout-style-default callout-note callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Note
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p><code>type:</code> is empty!</p>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
<section id="template-free-dataset" class="level3">
|
||
<h3 class="anchored" data-anchor-id="template-free-dataset"><a href="../../docs/dataset-formats/template_free.html">Template Free Dataset</a></h3>
|
||
<p>We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.</p>
|
||
<p>In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.</p>
|
||
<div class="sourceCode" id="cb7"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"segments"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="kw">true</span><span class="fu">,</span></span>
|
||
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"text"</span><span class="fu">:</span> <span class="st">"<s>Hello</span><span class="ch">\n</span><span class="st">"</span></span>
|
||
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="kw">true</span><span class="fu">,</span></span>
|
||
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">"text"</span><span class="fu">:</span> <span class="st">"hi there!. "</span></span>
|
||
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb7-11"><a href="#cb7-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb7-12"><a href="#cb7-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="kw">false</span><span class="fu">,</span></span>
|
||
<span id="cb7-13"><a href="#cb7-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">"text"</span><span class="fu">:</span> <span class="st">"goodbye "</span></span>
|
||
<span id="cb7-14"><a href="#cb7-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb7-15"><a href="#cb7-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb7-16"><a href="#cb7-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="kw">true</span><span class="fu">,</span></span>
|
||
<span id="cb7-17"><a href="#cb7-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"text"</span><span class="fu">:</span> <span class="st">"farewell</s>"</span></span>
|
||
<span id="cb7-18"><a href="#cb7-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||
<span id="cb7-19"><a href="#cb7-19" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
|
||
<span id="cb7-20"><a href="#cb7-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>Each prompt must be have a key called <code>segments</code> which is a list of <code>{ text, label }</code>.</p>
|
||
<div class="sourceCode" id="cb8"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> input_output</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="conversation-dataset" class="level3">
|
||
<h3 class="anchored" data-anchor-id="conversation-dataset"><a href="../../docs/dataset-formats/conversation.html">Conversation Dataset</a></h3>
|
||
<p><code>conversation</code> messages are a list of messages which usually contain a <code>role</code> and <code>content</code> key.</p>
|
||
<div class="callout callout-style-default callout-tip callout-titled">
|
||
<div class="callout-header d-flex align-content-center">
|
||
<div class="callout-icon-container">
|
||
<i class="callout-icon"></i>
|
||
</div>
|
||
<div class="callout-title-container flex-fill">
|
||
Tip
|
||
</div>
|
||
</div>
|
||
<div class="callout-body-container callout-body">
|
||
<p>Fun fact: Axolotl synonymously refers to “chat” messages as <code>conversation</code> messages due to how FastChat initially used this term to build a widely used <a href="https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py">fastchat conversation</a> method for formatting chat messages prior to the creation of <code>chat_templates</code>.</p>
|
||
</div>
|
||
</div>
|
||
<section id="what-are-chat_templates" class="level4">
|
||
<h4 class="anchored" data-anchor-id="what-are-chat_templates">What are <code>chat_templates</code>?</h4>
|
||
<p>The current most popular and convenient method for inference is to use <code>chat_templates</code> for formatting prompts. Axolotl supports using <code>chat_templates</code> for training to ensure that the model performs in the same environment as in inference.</p>
|
||
<p>Here’s a quick rundown on <code>chat_template</code>: A <code>chat_template</code> is a Jinja2 template which formats a list of messages into a prompt.</p>
|
||
<p>An example of a prompt formatted into a popular template called ChatML can be seen below:</p>
|
||
<p>Single prompt (pretty-printed):</p>
|
||
<div class="sourceCode" id="cb9"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span></span>
|
||
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"Hi"</span></span>
|
||
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span></span>
|
||
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"How can I help you?"</span></span>
|
||
<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb9-12"><a href="#cb9-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span></span>
|
||
<span id="cb9-13"><a href="#cb9-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"Can you add 3+5?"</span></span>
|
||
<span id="cb9-14"><a href="#cb9-14" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span><span class="ot">,</span></span>
|
||
<span id="cb9-15"><a href="#cb9-15" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||
<span id="cb9-16"><a href="#cb9-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"assistant"</span><span class="fu">,</span></span>
|
||
<span id="cb9-17"><a href="#cb9-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"The answer is 8."</span></span>
|
||
<span id="cb9-18"><a href="#cb9-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||
<span id="cb9-19"><a href="#cb9-19" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
|
||
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>The ChatML template is as follows:</p>
|
||
<pre class="jinja2"><code>{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}</code></pre>
|
||
<p>The above prompt formatted into this template will result in:</p>
|
||
<pre><code><|im_start|>user
|
||
Hi<|im_end|>
|
||
<|im_start|>assistant
|
||
How can I help you?<|im_end|>
|
||
<|im_start|>user
|
||
Can you add 3+5?<|im_end|>
|
||
<|im_start|>assistant
|
||
The answer is 8.<|im_end|></code></pre>
|
||
<p>By using delimiters (<code><|im_start|></code> and <code><|im_end|></code>), a prompt separates different speakers which helps the model identify which portion belongs to whom.</p>
|
||
</section>
|
||
<section id="common-conversation-dataset-formats" class="level4">
|
||
<h4 class="anchored" data-anchor-id="common-conversation-dataset-formats">Common Conversation Dataset formats</h4>
|
||
<p>Older conversation datasets with the following format are colloquially called <code>sharegpt</code> datasets.</p>
|
||
<div class="sourceCode" id="cb12"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"conversations"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"from"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"value"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>Newer conversation datasets usually follow the OpenAI format.</p>
|
||
<div class="sourceCode" id="cb13"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>Axolotl supports both as well as allowing customization of any kind of key.</p>
|
||
</section>
|
||
<section id="chat-template-usage" class="level4">
|
||
<h4 class="anchored" data-anchor-id="chat-template-usage"><a href="../../docs/dataset-formats/conversation.html#chat_template">Chat Template Usage</a></h4>
|
||
<p>To properly use this method, it is important to identify three things:</p>
|
||
<ol type="1">
|
||
<li><p>Which <code>chat_template</code> would you use?</p></li>
|
||
<li><p>What are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be <code>messages</code>, <code>role</code>, and <code>content</code>, respectively, whereas the possible roles are <code>system</code>, <code>user</code>, and <code>assistant</code>.</p></li>
|
||
<li><p>What do you want to mask? For instance, only assistant messages, only last message, or nothing.</p></li>
|
||
</ol>
|
||
<section id="choosing-a-chat_template" class="level5">
|
||
<h5 class="anchored" data-anchor-id="choosing-a-chat_template">Choosing a <code>chat_template</code></h5>
|
||
<p>There are a lot of <code>chat_templates</code> out there. Axolotl supports the common ones: <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/860609392184cf62a7e0ca676658b170e059ce6c/src/axolotl/utils/chat_templates.py#L17">supported chat templates</a>. For example, to use ChatML, it would be <code>chat_template: chatml</code>.</p>
|
||
<p>However, it is also possible to use the already configured template within the tokenizer by specifying <code>chat_template: tokenizer_default</code>. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do <code>chat_template: tokenizer_default_fallback_chatml</code> to fallback to the ChatML template if a tokenizer template was not found.</p>
|
||
<p>One last but powerful approach is to bring your own template. This can be set via:</p>
|
||
<div class="sourceCode" id="cb14"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="co"> # your template</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
<section id="setting-chat_template-dataset-keys" class="level5">
|
||
<h5 class="anchored" data-anchor-id="setting-chat_template-dataset-keys">Setting <code>chat_template</code> dataset keys</h5>
|
||
<p>We currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.</p>
|
||
<p>If your dataset format is different, here are the keys you should check (with their defaults):</p>
|
||
<div class="sourceCode" id="cb15"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> messages</span></span>
|
||
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> role</span></span>
|
||
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> content</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>In some <code>chat_templates</code> (e.g. <a href="https://huggingface.co/google/gemma-2b-it/blob/main/tokenizer_config.json#L1507">Gemma</a>), the roles are hardcoded to <code>user</code> and <code>assistant</code>. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a <code>KeyError</code>, it would be necessary to add mapping for your roles. Here is an example of how it would look like:</p>
|
||
<div class="sourceCode" id="cb16"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
|
||
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">assistant</span><span class="kw">:</span></span>
|
||
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> gpt</span></span>
|
||
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> model</span></span>
|
||
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span></span>
|
||
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>In the example above, all <code>gpt</code> and <code>model</code> values are converted to <code>assistant</code>. All <code>human</code> values are converted to <code>user.</code></p>
|
||
</section>
|
||
<section id="handling-masking" class="level5">
|
||
<h5 class="anchored" data-anchor-id="handling-masking">Handling masking</h5>
|
||
<p>The common use case for <code>chat_template</code> is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.</p>
|
||
<p>To train on all <code>assistant</code> messages, you would set the following configs.</p>
|
||
<div class="sourceCode" id="cb17"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">]</span></span>
|
||
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> </span><span class="st">"turn"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>The <code>train_on_eos</code> config means that it would mask all EOS tokens for turns that aren’t assistant-turns. The other options are: <code>all</code> and <code>last</code> to choose which EOS to train on.</p>
|
||
<p>Perhaps, you want to train on <code>assistant</code> and <code>narrator</code> roles, you can simply add <code>narrator</code> to the list of <code>roles_to_train</code>. You would also need to add it to the mapping of <code>roles</code> above.</p>
|
||
<div class="sourceCode" id="cb18"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">,</span><span class="at"> </span><span class="st">"narrator"</span><span class="kw">]</span></span>
|
||
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
|
||
<span id="cb18-5"><a href="#cb18-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">assistant</span><span class="kw">:</span></span>
|
||
<span id="cb18-6"><a href="#cb18-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> gpt</span></span>
|
||
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> model</span></span>
|
||
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span></span>
|
||
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span>
|
||
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">narrator</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"narrator"</span><span class="kw">]</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
</section>
|
||
</section>
|
||
<section id="applying-chat_template" class="level4">
|
||
<h4 class="anchored" data-anchor-id="applying-chat_template">Applying <code>chat_template</code></h4>
|
||
<p>Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset. The final step would be to correctly set the EOS token in your config:</p>
|
||
<div class="sourceCode" id="cb19"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb19-3"><a href="#cb19-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template</span></span>
|
||
<span id="cb19-4"><a href="#cb19-4" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb19-5"><a href="#cb19-5" aria-hidden="true" tabindex="-1"></a><span class="co"> # step 1</span></span>
|
||
<span id="cb19-6"><a href="#cb19-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">chat_template</span><span class="kw">:</span><span class="at"> chatml</span></span>
|
||
<span id="cb19-7"><a href="#cb19-7" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb19-8"><a href="#cb19-8" aria-hidden="true" tabindex="-1"></a><span class="co"> # step 2</span></span>
|
||
<span id="cb19-9"><a href="#cb19-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> messages</span></span>
|
||
<span id="cb19-10"><a href="#cb19-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_role</span><span class="kw">:</span><span class="at"> role</span></span>
|
||
<span id="cb19-11"><a href="#cb19-11" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_field_content</span><span class="kw">:</span><span class="at"> content</span></span>
|
||
<span id="cb19-12"><a href="#cb19-12" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb19-13"><a href="#cb19-13" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
|
||
<span id="cb19-14"><a href="#cb19-14" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">assistant</span><span class="kw">:</span></span>
|
||
<span id="cb19-15"><a href="#cb19-15" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> gpt</span></span>
|
||
<span id="cb19-16"><a href="#cb19-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> model</span></span>
|
||
<span id="cb19-17"><a href="#cb19-17" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> assistant</span></span>
|
||
<span id="cb19-18"><a href="#cb19-18" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span></span>
|
||
<span id="cb19-19"><a href="#cb19-19" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span>
|
||
<span id="cb19-20"><a href="#cb19-20" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> user</span></span>
|
||
<span id="cb19-21"><a href="#cb19-21" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb19-22"><a href="#cb19-22" aria-hidden="true" tabindex="-1"></a><span class="co"> # step 3</span></span>
|
||
<span id="cb19-23"><a href="#cb19-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">]</span></span>
|
||
<span id="cb19-24"><a href="#cb19-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> </span><span class="st">"turn"</span></span>
|
||
<span id="cb19-25"><a href="#cb19-25" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb19-26"><a href="#cb19-26" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||
<span id="cb19-27"><a href="#cb19-27" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> <|im_end|></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via <code>axolotl preprocess config.yaml --debug</code>):</p>
|
||
<pre><code><|im_start|>(-100, 128256) user(-100, 882)
|
||
(-100, 198) Hi(-100, 13347) <|im_end|>(-100, 128257)
|
||
(-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191)
|
||
(-100, 198) How(4438, 4438) can(649, 649) I(358, 358) help(1520, 1520) you(499, 499) ?(30, 30) <|im_end|>(128257, 128257)
|
||
(-100, 198) <|im_start|>(-100, 128256) user(-100, 882)
|
||
(-100, 198) Can(-100, 6854) you(-100, 499) add(-100, 923) (-100, 220) 3(-100, 18) +(-100, 10) 5(-100, 20) ?(-100, 30) <|im_end|>(-100, 128257)
|
||
(-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191)
|
||
(-100, 198) The(791, 791) answer(4320, 4320) is(374, 374) (220, 220) 8(23, 23) .(13, 13) <|im_end|>(128257, 128257)
|
||
(-100, 198)</code></pre>
|
||
<p>The first number refers to the label, the second refers to the <code>token_id</code>. For example, <code>-100</code> labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the <code>token_id</code>.</p>
|
||
</section>
|
||
</section>
|
||
<section id="instruction-dataset" class="level3">
|
||
<h3 class="anchored" data-anchor-id="instruction-dataset"><a href="../../docs/dataset-formats/inst_tune.html">Instruction Dataset</a></h3>
|
||
<p>Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.</p>
|
||
<p>An example is of a common format called Alpaca:</p>
|
||
<div class="sourceCode" id="cb21"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb21-1"><a href="#cb21-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>Using those keys, a prompt can be built based on it.</p>
|
||
<pre><code>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||
|
||
### Instruction:
|
||
{instruction}
|
||
|
||
### Input:
|
||
{input}
|
||
|
||
### Response:
|
||
{output}</code></pre>
|
||
<p>This can be configured as such:</p>
|
||
<div class="sourceCode" id="cb23"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb23-1"><a href="#cb23-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb23-2"><a href="#cb23-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||
<span id="cb23-3"><a href="#cb23-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> alpaca</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.</p>
|
||
<section id="custom-instruct-prompt-format" class="level4">
|
||
<h4 class="anchored" data-anchor-id="custom-instruct-prompt-format">Custom Instruct Prompt Format</h4>
|
||
<p>Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.</p>
|
||
<p>In the example below, a sample row is used to output in <code>mistral_v1</code> format.</p>
|
||
<div class="sourceCode" id="cb24"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<div class="sourceCode" id="cb25"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||
<span id="cb25-2"><a href="#cb25-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> repo</span></span>
|
||
<span id="cb25-3"><a href="#cb25-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span>
|
||
<span id="cb25-4"><a href="#cb25-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> </span><span class="st">""</span></span>
|
||
<span id="cb25-5"><a href="#cb25-5" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb25-6"><a href="#cb25-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_system</span><span class="kw">:</span></span>
|
||
<span id="cb25-7"><a href="#cb25-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_instruction</span><span class="kw">:</span><span class="at"> input</span></span>
|
||
<span id="cb25-8"><a href="#cb25-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_input</span><span class="kw">:</span></span>
|
||
<span id="cb25-9"><a href="#cb25-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_output</span><span class="kw">:</span><span class="at"> output</span></span>
|
||
<span id="cb25-10"><a href="#cb25-10" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb25-11"><a href="#cb25-11" aria-hidden="true" tabindex="-1"></a><span class="co"> # multi-line example with input</span></span>
|
||
<span id="cb25-12"><a href="#cb25-12" aria-hidden="true" tabindex="-1"></a><span class="fu"> format</span><span class="kw">: </span><span class="ch">|-</span></span>
|
||
<span id="cb25-13"><a href="#cb25-13" aria-hidden="true" tabindex="-1"></a> [INST] {instruction} {input} [/INST]</span>
|
||
<span id="cb25-14"><a href="#cb25-14" aria-hidden="true" tabindex="-1"></a></span>
|
||
<span id="cb25-15"><a href="#cb25-15" aria-hidden="true" tabindex="-1"></a><span class="co"> # single-line example without input</span></span>
|
||
<span id="cb25-16"><a href="#cb25-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"[INST] {instruction} [/INST]"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||
<p>The config sets that the <code>field_instruction</code> is actually named <code>input</code>, and the <code>field_input</code> is empty as we don’t have an <code>input</code> in this sample. Generally, <code>instruction</code> can be thought as the question to the model, and <code>input</code> as the additional information with <code>output</code> being the response. It is not necessary to have an <code>input</code> nor <code>system</code>. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.</p>
|
||
</section>
|
||
</section>
|
||
</section>
|
||
<section id="reinforcement-learning-from-human-feedback-rlhf" class="level2">
|
||
<h2 class="anchored" data-anchor-id="reinforcement-learning-from-human-feedback-rlhf">Reinforcement Learning from Human Feedback (RLHF)</h2>
|
||
<p>As there are multiple RLHF methods with their own dataset requirements. Please see <a href="../../docs/rlhf.html">RLHF datasets</a> documentation for more detail.</p>
|
||
|
||
|
||
</section>
|
||
|
||
</main> <!-- /main -->
|
||
<script id="quarto-html-after-body" type="application/javascript">
|
||
window.document.addEventListener("DOMContentLoaded", function (event) {
|
||
const toggleBodyColorMode = (bsSheetEl) => {
|
||
const mode = bsSheetEl.getAttribute("data-mode");
|
||
const bodyEl = window.document.querySelector("body");
|
||
if (mode === "dark") {
|
||
bodyEl.classList.add("quarto-dark");
|
||
bodyEl.classList.remove("quarto-light");
|
||
} else {
|
||
bodyEl.classList.add("quarto-light");
|
||
bodyEl.classList.remove("quarto-dark");
|
||
}
|
||
}
|
||
const toggleBodyColorPrimary = () => {
|
||
const bsSheetEl = window.document.querySelector("link#quarto-bootstrap");
|
||
if (bsSheetEl) {
|
||
toggleBodyColorMode(bsSheetEl);
|
||
}
|
||
}
|
||
toggleBodyColorPrimary();
|
||
const icon = "";
|
||
const anchorJS = new window.AnchorJS();
|
||
anchorJS.options = {
|
||
placement: 'right',
|
||
icon: icon
|
||
};
|
||
anchorJS.add('.anchored');
|
||
const isCodeAnnotation = (el) => {
|
||
for (const clz of el.classList) {
|
||
if (clz.startsWith('code-annotation-')) {
|
||
return true;
|
||
}
|
||
}
|
||
return false;
|
||
}
|
||
const onCopySuccess = function(e) {
|
||
// button target
|
||
const button = e.trigger;
|
||
// don't keep focus
|
||
button.blur();
|
||
// flash "checked"
|
||
button.classList.add('code-copy-button-checked');
|
||
var currentTitle = button.getAttribute("title");
|
||
button.setAttribute("title", "Copied!");
|
||
let tooltip;
|
||
if (window.bootstrap) {
|
||
button.setAttribute("data-bs-toggle", "tooltip");
|
||
button.setAttribute("data-bs-placement", "left");
|
||
button.setAttribute("data-bs-title", "Copied!");
|
||
tooltip = new bootstrap.Tooltip(button,
|
||
{ trigger: "manual",
|
||
customClass: "code-copy-button-tooltip",
|
||
offset: [0, -8]});
|
||
tooltip.show();
|
||
}
|
||
setTimeout(function() {
|
||
if (tooltip) {
|
||
tooltip.hide();
|
||
button.removeAttribute("data-bs-title");
|
||
button.removeAttribute("data-bs-toggle");
|
||
button.removeAttribute("data-bs-placement");
|
||
}
|
||
button.setAttribute("title", currentTitle);
|
||
button.classList.remove('code-copy-button-checked');
|
||
}, 1000);
|
||
// clear code selection
|
||
e.clearSelection();
|
||
}
|
||
const getTextToCopy = function(trigger) {
|
||
const codeEl = trigger.previousElementSibling.cloneNode(true);
|
||
for (const childEl of codeEl.children) {
|
||
if (isCodeAnnotation(childEl)) {
|
||
childEl.remove();
|
||
}
|
||
}
|
||
return codeEl.innerText;
|
||
}
|
||
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
|
||
text: getTextToCopy
|
||
});
|
||
clipboard.on('success', onCopySuccess);
|
||
if (window.document.getElementById('quarto-embedded-source-code-modal')) {
|
||
const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
|
||
text: getTextToCopy,
|
||
container: window.document.getElementById('quarto-embedded-source-code-modal')
|
||
});
|
||
clipboardModal.on('success', onCopySuccess);
|
||
}
|
||
var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
|
||
var mailtoRegex = new RegExp(/^mailto:/);
|
||
var filterRegex = new RegExp("https:\/\/axolotl-ai-cloud\.github\.io\/axolotl\/");
|
||
var isInternal = (href) => {
|
||
return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
|
||
}
|
||
// Inspect non-navigation links and adorn them if external
|
||
var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
|
||
for (var i=0; i<links.length; i++) {
|
||
const link = links[i];
|
||
if (!isInternal(link.href)) {
|
||
// undo the damage that might have been done by quarto-nav.js in the case of
|
||
// links that we want to consider external
|
||
if (link.dataset.originalHref !== undefined) {
|
||
link.href = link.dataset.originalHref;
|
||
}
|
||
}
|
||
}
|
||
function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
|
||
const config = {
|
||
allowHTML: true,
|
||
maxWidth: 500,
|
||
delay: 100,
|
||
arrow: false,
|
||
appendTo: function(el) {
|
||
return el.parentElement;
|
||
},
|
||
interactive: true,
|
||
interactiveBorder: 10,
|
||
theme: 'quarto',
|
||
placement: 'bottom-start',
|
||
};
|
||
if (contentFn) {
|
||
config.content = contentFn;
|
||
}
|
||
if (onTriggerFn) {
|
||
config.onTrigger = onTriggerFn;
|
||
}
|
||
if (onUntriggerFn) {
|
||
config.onUntrigger = onUntriggerFn;
|
||
}
|
||
window.tippy(el, config);
|
||
}
|
||
const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
|
||
for (var i=0; i<noterefs.length; i++) {
|
||
const ref = noterefs[i];
|
||
tippyHover(ref, function() {
|
||
// use id or data attribute instead here
|
||
let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
|
||
try { href = new URL(href).hash; } catch {}
|
||
const id = href.replace(/^#\/?/, "");
|
||
const note = window.document.getElementById(id);
|
||
if (note) {
|
||
return note.innerHTML;
|
||
} else {
|
||
return "";
|
||
}
|
||
});
|
||
}
|
||
const xrefs = window.document.querySelectorAll('a.quarto-xref');
|
||
const processXRef = (id, note) => {
|
||
// Strip column container classes
|
||
const stripColumnClz = (el) => {
|
||
el.classList.remove("page-full", "page-columns");
|
||
if (el.children) {
|
||
for (const child of el.children) {
|
||
stripColumnClz(child);
|
||
}
|
||
}
|
||
}
|
||
stripColumnClz(note)
|
||
if (id === null || id.startsWith('sec-')) {
|
||
// Special case sections, only their first couple elements
|
||
const container = document.createElement("div");
|
||
if (note.children && note.children.length > 2) {
|
||
container.appendChild(note.children[0].cloneNode(true));
|
||
for (let i = 1; i < note.children.length; i++) {
|
||
const child = note.children[i];
|
||
if (child.tagName === "P" && child.innerText === "") {
|
||
continue;
|
||
} else {
|
||
container.appendChild(child.cloneNode(true));
|
||
break;
|
||
}
|
||
}
|
||
if (window.Quarto?.typesetMath) {
|
||
window.Quarto.typesetMath(container);
|
||
}
|
||
return container.innerHTML
|
||
} else {
|
||
if (window.Quarto?.typesetMath) {
|
||
window.Quarto.typesetMath(note);
|
||
}
|
||
return note.innerHTML;
|
||
}
|
||
} else {
|
||
// Remove any anchor links if they are present
|
||
const anchorLink = note.querySelector('a.anchorjs-link');
|
||
if (anchorLink) {
|
||
anchorLink.remove();
|
||
}
|
||
if (window.Quarto?.typesetMath) {
|
||
window.Quarto.typesetMath(note);
|
||
}
|
||
if (note.classList.contains("callout")) {
|
||
return note.outerHTML;
|
||
} else {
|
||
return note.innerHTML;
|
||
}
|
||
}
|
||
}
|
||
for (var i=0; i<xrefs.length; i++) {
|
||
const xref = xrefs[i];
|
||
tippyHover(xref, undefined, function(instance) {
|
||
instance.disable();
|
||
let url = xref.getAttribute('href');
|
||
let hash = undefined;
|
||
if (url.startsWith('#')) {
|
||
hash = url;
|
||
} else {
|
||
try { hash = new URL(url).hash; } catch {}
|
||
}
|
||
if (hash) {
|
||
const id = hash.replace(/^#\/?/, "");
|
||
const note = window.document.getElementById(id);
|
||
if (note !== null) {
|
||
try {
|
||
const html = processXRef(id, note.cloneNode(true));
|
||
instance.setContent(html);
|
||
} finally {
|
||
instance.enable();
|
||
instance.show();
|
||
}
|
||
} else {
|
||
// See if we can fetch this
|
||
fetch(url.split('#')[0])
|
||
.then(res => res.text())
|
||
.then(html => {
|
||
const parser = new DOMParser();
|
||
const htmlDoc = parser.parseFromString(html, "text/html");
|
||
const note = htmlDoc.getElementById(id);
|
||
if (note !== null) {
|
||
const html = processXRef(id, note);
|
||
instance.setContent(html);
|
||
}
|
||
}).finally(() => {
|
||
instance.enable();
|
||
instance.show();
|
||
});
|
||
}
|
||
} else {
|
||
// See if we can fetch a full url (with no hash to target)
|
||
// This is a special case and we should probably do some content thinning / targeting
|
||
fetch(url)
|
||
.then(res => res.text())
|
||
.then(html => {
|
||
const parser = new DOMParser();
|
||
const htmlDoc = parser.parseFromString(html, "text/html");
|
||
const note = htmlDoc.querySelector('main.content');
|
||
if (note !== null) {
|
||
// This should only happen for chapter cross references
|
||
// (since there is no id in the URL)
|
||
// remove the first header
|
||
if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
|
||
note.children[0].remove();
|
||
}
|
||
const html = processXRef(null, note);
|
||
instance.setContent(html);
|
||
}
|
||
}).finally(() => {
|
||
instance.enable();
|
||
instance.show();
|
||
});
|
||
}
|
||
}, function(instance) {
|
||
});
|
||
}
|
||
let selectedAnnoteEl;
|
||
const selectorForAnnotation = ( cell, annotation) => {
|
||
let cellAttr = 'data-code-cell="' + cell + '"';
|
||
let lineAttr = 'data-code-annotation="' + annotation + '"';
|
||
const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
|
||
return selector;
|
||
}
|
||
const selectCodeLines = (annoteEl) => {
|
||
const doc = window.document;
|
||
const targetCell = annoteEl.getAttribute("data-target-cell");
|
||
const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
|
||
const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
|
||
const lines = annoteSpan.getAttribute("data-code-lines").split(",");
|
||
const lineIds = lines.map((line) => {
|
||
return targetCell + "-" + line;
|
||
})
|
||
let top = null;
|
||
let height = null;
|
||
let parent = null;
|
||
if (lineIds.length > 0) {
|
||
//compute the position of the single el (top and bottom and make a div)
|
||
const el = window.document.getElementById(lineIds[0]);
|
||
top = el.offsetTop;
|
||
height = el.offsetHeight;
|
||
parent = el.parentElement.parentElement;
|
||
if (lineIds.length > 1) {
|
||
const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
|
||
const bottom = lastEl.offsetTop + lastEl.offsetHeight;
|
||
height = bottom - top;
|
||
}
|
||
if (top !== null && height !== null && parent !== null) {
|
||
// cook up a div (if necessary) and position it
|
||
let div = window.document.getElementById("code-annotation-line-highlight");
|
||
if (div === null) {
|
||
div = window.document.createElement("div");
|
||
div.setAttribute("id", "code-annotation-line-highlight");
|
||
div.style.position = 'absolute';
|
||
parent.appendChild(div);
|
||
}
|
||
div.style.top = top - 2 + "px";
|
||
div.style.height = height + 4 + "px";
|
||
div.style.left = 0;
|
||
let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
|
||
if (gutterDiv === null) {
|
||
gutterDiv = window.document.createElement("div");
|
||
gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
|
||
gutterDiv.style.position = 'absolute';
|
||
const codeCell = window.document.getElementById(targetCell);
|
||
const gutter = codeCell.querySelector('.code-annotation-gutter');
|
||
gutter.appendChild(gutterDiv);
|
||
}
|
||
gutterDiv.style.top = top - 2 + "px";
|
||
gutterDiv.style.height = height + 4 + "px";
|
||
}
|
||
selectedAnnoteEl = annoteEl;
|
||
}
|
||
};
|
||
const unselectCodeLines = () => {
|
||
const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
|
||
elementsIds.forEach((elId) => {
|
||
const div = window.document.getElementById(elId);
|
||
if (div) {
|
||
div.remove();
|
||
}
|
||
});
|
||
selectedAnnoteEl = undefined;
|
||
};
|
||
// Handle positioning of the toggle
|
||
window.addEventListener(
|
||
"resize",
|
||
throttle(() => {
|
||
elRect = undefined;
|
||
if (selectedAnnoteEl) {
|
||
selectCodeLines(selectedAnnoteEl);
|
||
}
|
||
}, 10)
|
||
);
|
||
function throttle(fn, ms) {
|
||
let throttle = false;
|
||
let timer;
|
||
return (...args) => {
|
||
if(!throttle) { // first call gets through
|
||
fn.apply(this, args);
|
||
throttle = true;
|
||
} else { // all the others get throttled
|
||
if(timer) clearTimeout(timer); // cancel #2
|
||
timer = setTimeout(() => {
|
||
fn.apply(this, args);
|
||
timer = throttle = false;
|
||
}, ms);
|
||
}
|
||
};
|
||
}
|
||
// Attach click handler to the DT
|
||
const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
|
||
for (const annoteDlNode of annoteDls) {
|
||
annoteDlNode.addEventListener('click', (event) => {
|
||
const clickedEl = event.target;
|
||
if (clickedEl !== selectedAnnoteEl) {
|
||
unselectCodeLines();
|
||
const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
|
||
if (activeEl) {
|
||
activeEl.classList.remove('code-annotation-active');
|
||
}
|
||
selectCodeLines(clickedEl);
|
||
clickedEl.classList.add('code-annotation-active');
|
||
} else {
|
||
// Unselect the line
|
||
unselectCodeLines();
|
||
clickedEl.classList.remove('code-annotation-active');
|
||
}
|
||
});
|
||
}
|
||
const findCites = (el) => {
|
||
const parentEl = el.parentElement;
|
||
if (parentEl) {
|
||
const cites = parentEl.dataset.cites;
|
||
if (cites) {
|
||
return {
|
||
el,
|
||
cites: cites.split(' ')
|
||
};
|
||
} else {
|
||
return findCites(el.parentElement)
|
||
}
|
||
} else {
|
||
return undefined;
|
||
}
|
||
};
|
||
var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
|
||
for (var i=0; i<bibliorefs.length; i++) {
|
||
const ref = bibliorefs[i];
|
||
const citeInfo = findCites(ref);
|
||
if (citeInfo) {
|
||
tippyHover(citeInfo.el, function() {
|
||
var popup = window.document.createElement('div');
|
||
citeInfo.cites.forEach(function(cite) {
|
||
var citeDiv = window.document.createElement('div');
|
||
citeDiv.classList.add('hanging-indent');
|
||
citeDiv.classList.add('csl-entry');
|
||
var biblioDiv = window.document.getElementById('ref-' + cite);
|
||
if (biblioDiv) {
|
||
citeDiv.innerHTML = biblioDiv.innerHTML;
|
||
}
|
||
popup.appendChild(citeDiv);
|
||
});
|
||
return popup.innerHTML;
|
||
});
|
||
}
|
||
}
|
||
});
|
||
</script>
|
||
</div> <!-- /content -->
|
||
|
||
|
||
|
||
|
||
</body></html> |