Built site for gh-pages
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
|
||||
|
||||
<meta charset="utf-8">
|
||||
<meta name="generator" content="quarto-1.7.34">
|
||||
<meta name="generator" content="quarto-1.8.24">
|
||||
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
|
||||
|
||||
@@ -68,14 +68,15 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
|
||||
<script src="../../site_libs/quarto-html/quarto.js" type="module"></script>
|
||||
<script src="../../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
|
||||
<script src="../../site_libs/quarto-html/axe/axe-check.js" type="module"></script>
|
||||
<script src="../../site_libs/quarto-html/popper.min.js"></script>
|
||||
<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
|
||||
<script src="../../site_libs/quarto-html/anchor.min.js"></script>
|
||||
<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
|
||||
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-befe23ebd2f54d8af2c8a89d1a1611f1.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||||
<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-b651517ce65839d647a86e2780455cfb.css" rel="stylesheet" id="quarto-text-highlighting-styles">
|
||||
<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
|
||||
<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
|
||||
<link href="../../site_libs/bootstrap/bootstrap-e9895ec3143e9833a687747e8d39d226.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
|
||||
<link href="../../site_libs/bootstrap/bootstrap-f9d679a32da2b248d4ca48a0e58e089e.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
|
||||
<script id="quarto-search-options" type="application/json">{
|
||||
"location": "navbar",
|
||||
"copy-button": false,
|
||||
@@ -125,7 +126,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
|
||||
<div class="navbar-container container-fluid">
|
||||
<div class="navbar-brand-container mx-auto">
|
||||
<a href="../../index.html" class="navbar-brand navbar-brand-logo">
|
||||
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo">
|
||||
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo light-content">
|
||||
<img src="../../image/axolotl_logo_digital_white.svg" alt="" class="navbar-logo dark-content">
|
||||
</a>
|
||||
</div>
|
||||
<div class="quarto-navbar-tools tools-wide tools-end">
|
||||
@@ -151,6 +153,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
|
||||
<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
|
||||
<!-- sidebar -->
|
||||
<nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
|
||||
<div class="pt-lg-2 mt-2 text-left sidebar-header">
|
||||
<a href="../../index.html" class="sidebar-logo-link">
|
||||
</a>
|
||||
</div>
|
||||
<div class="sidebar-menu-container">
|
||||
<ul class="list-unstyled mt-1">
|
||||
<li class="sidebar-item">
|
||||
@@ -584,25 +590,25 @@ Tip
|
||||
<h2 class="anchored" data-anchor-id="pre-training">Pre-training</h2>
|
||||
<p>When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports <a href="https://huggingface.co/docs/datasets/en/stream">streaming</a> to only load batches into memory at a time.</p>
|
||||
<p>A sample format for a pre-training dataset is as follows:</p>
|
||||
<div class="sourceCode" id="cb1"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"text"</span><span class="fu">:</span> <span class="st">"first row"</span><span class="fu">}</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb1"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"text"</span><span class="fu">:</span> <span class="st">"first row"</span><span class="fu">}</span></span>
|
||||
<span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"text"</span><span class="fu">:</span> <span class="st">"second row"</span><span class="fu">}</span></span>
|
||||
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="er">...</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="er">...</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>It is typically recommended to save your dataset as <code>.jsonl</code> due to its flexibility and simplicity.</p>
|
||||
<p>Axolotl supports loading from a Hugging Face hub repo or from local files.</p>
|
||||
<section id="pre-training-from-hugging-face-hub-datasets" class="level3">
|
||||
<h3 class="anchored" data-anchor-id="pre-training-from-hugging-face-hub-datasets">Pre-training from Hugging Face hub datasets</h3>
|
||||
<p>As an example, to train using a Hugging Face dataset <code>hf_org/name</code>, you can pass the following config:</p>
|
||||
<div class="sourceCode" id="cb2"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span><span class="at"> hf_org/name</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb2"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span><span class="at"> hf_org/name</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
</section>
|
||||
<section id="pre-training-from-local-dataset-files" class="level3">
|
||||
<h3 class="anchored" data-anchor-id="pre-training-from-local-dataset-files">Pre-training from local dataset files</h3>
|
||||
<p>Given a few corpus files: <code>A.jsonl</code>, <code>B.jsonl</code>, and <code>C.jsonl</code>, your config will look like the below:</p>
|
||||
<div class="sourceCode" id="cb3"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb3"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="fu">pretraining_dataset</span><span class="kw">:</span></span>
|
||||
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> json</span></span>
|
||||
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">data_files</span><span class="kw">:</span></span>
|
||||
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> B.jsonl</span></span>
|
||||
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> C.jsonl</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> C.jsonl</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>While we recommend <code>.jsonl</code>, you can also use the other formats (<code>csv</code>, <code>parquet</code>, <code>arrow</code>, <code>SQL</code>, <code>Webdataset</code>) that are supported by <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files"><code>Dataset.load_dataset</code></a></p>
|
||||
</section>
|
||||
<section id="pre-training-without-streaming" class="level3">
|
||||
@@ -610,16 +616,16 @@ Tip
|
||||
<p>On the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the <code>completion</code> format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.</p>
|
||||
<p>One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.</p>
|
||||
<p>From Hugging Face:</p>
|
||||
<div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> hf_org/name</span></span>
|
||||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>From local files:</p>
|
||||
<div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span>
|
||||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> B.jsonl</span></span>
|
||||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> completion</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<div class="callout callout-style-default callout-important callout-titled">
|
||||
<div class="callout-header d-flex align-content-center">
|
||||
<div class="callout-icon-container">
|
||||
@@ -701,9 +707,9 @@ Tip
|
||||
</div>
|
||||
</div>
|
||||
<p>A config for this would look like:</p>
|
||||
<div class="sourceCode" id="cb6"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb6"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<div class="callout callout-style-default callout-note callout-titled">
|
||||
<div class="callout-header d-flex align-content-center">
|
||||
<div class="callout-icon-container">
|
||||
@@ -723,7 +729,7 @@ Note
|
||||
<h3 class="anchored" data-anchor-id="template-free-dataset">Template Free Dataset</h3>
|
||||
<p>We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.</p>
|
||||
<p>In the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.</p>
|
||||
<div class="sourceCode" id="cb7"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb7"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"segments"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"label"</span><span class="fu">:</span> <span class="kw">true</span><span class="fu">,</span></span>
|
||||
@@ -742,11 +748,11 @@ Note
|
||||
<span id="cb7-17"><a href="#cb7-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"text"</span><span class="fu">:</span> <span class="st">"farewell</s>"</span></span>
|
||||
<span id="cb7-18"><a href="#cb7-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||||
<span id="cb7-19"><a href="#cb7-19" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
|
||||
<span id="cb7-20"><a href="#cb7-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb7-20"><a href="#cb7-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Each prompt must be have a key called <code>segments</code> which is a list of <code>{ text, label }</code>.</p>
|
||||
<div class="sourceCode" id="cb8"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb8"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> input_output</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> input_output</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Reference: <a href="../../docs/dataset-formats/template_free.html">Template Free Documentation</a>.</p>
|
||||
</section>
|
||||
<section id="conversation-dataset" class="level3">
|
||||
@@ -771,7 +777,7 @@ Tip
|
||||
<p>Here’s a quick rundown on <code>chat_template</code>: A <code>chat_template</code> is a Jinja2 template which formats a list of messages into a prompt.</p>
|
||||
<p>An example of a prompt formatted into a popular template called ChatML can be seen below:</p>
|
||||
<p>Single prompt (pretty-printed):</p>
|
||||
<div class="sourceCode" id="cb9"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb9"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span></span>
|
||||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span></span>
|
||||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="fu">{</span></span>
|
||||
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">"role"</span><span class="fu">:</span> <span class="st">"user"</span><span class="fu">,</span></span>
|
||||
@@ -790,7 +796,7 @@ Tip
|
||||
<span id="cb9-17"><a href="#cb9-17" aria-hidden="true" tabindex="-1"></a> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"The answer is 8."</span></span>
|
||||
<span id="cb9-18"><a href="#cb9-18" aria-hidden="true" tabindex="-1"></a> <span class="fu">}</span></span>
|
||||
<span id="cb9-19"><a href="#cb9-19" aria-hidden="true" tabindex="-1"></a> <span class="ot">]</span></span>
|
||||
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>The ChatML template is as follows:</p>
|
||||
<pre class="jinja2"><code>{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}</code></pre>
|
||||
<p>The above prompt formatted into this template will result in:</p>
|
||||
@@ -807,9 +813,9 @@ The answer is 8.<|im_end|></code></pre>
|
||||
<section id="common-conversation-dataset-formats" class="level4">
|
||||
<h4 class="anchored" data-anchor-id="common-conversation-dataset-formats">Common Conversation Dataset formats</h4>
|
||||
<p>Older conversation datasets with the following format are colloquially called <code>sharegpt</code> datasets.</p>
|
||||
<div class="sourceCode" id="cb12"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"conversations"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"from"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"value"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb12"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb12-1"><a href="#cb12-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"conversations"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"from"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"value"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Newer conversation datasets usually follow the OpenAI format.</p>
|
||||
<div class="sourceCode" id="cb13"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb13"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"messages"</span><span class="fu">:</span> <span class="ot">[</span><span class="fu">{</span><span class="dt">"role"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"content"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span><span class="ot">]</span><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Axolotl supports both as well as allowing customization of any kind of key.</p>
|
||||
</section>
|
||||
<section id="chat-template-usage" class="level4">
|
||||
@@ -825,40 +831,40 @@ The answer is 8.<|im_end|></code></pre>
|
||||
<p>There are a lot of <code>chat_templates</code> out there. Axolotl supports the common ones: <a href="https://github.com/axolotl-ai-cloud/axolotl/blob/860609392184cf62a7e0ca676658b170e059ce6c/src/axolotl/utils/chat_templates.py#L17">supported chat templates</a>. For example, to use ChatML, it would be <code>chat_template: chatml</code>.</p>
|
||||
<p>However, it is also possible to use the already configured template within the tokenizer by specifying <code>chat_template: tokenizer_default</code>. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do <code>chat_template: tokenizer_default_fallback_chatml</code> to fallback to the ChatML template if a tokenizer template was not found.</p>
|
||||
<p>One last but powerful approach is to bring your own template. This can be set via:</p>
|
||||
<div class="sourceCode" id="cb14"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="co"> # your template</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb14"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="fu">chat_template_jinja</span><span class="kw">:</span><span class="co"> # your template</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
</section>
|
||||
<section id="setting-chat_template-dataset-keys" class="level5">
|
||||
<h5 class="anchored" data-anchor-id="setting-chat_template-dataset-keys">Setting <code>chat_template</code> dataset keys</h5>
|
||||
<p>We currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.</p>
|
||||
<p>If your dataset format is different, here are the keys you should check (with their defaults):</p>
|
||||
<div class="sourceCode" id="cb15"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb15"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb15-1"><a href="#cb15-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb15-2"><a href="#cb15-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||||
<span id="cb15-3"><a href="#cb15-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">field_messages</span><span class="kw">:</span><span class="at"> messages</span><span class="co"> # this should point to the key containing the list of conversations</span></span>
|
||||
<span id="cb15-4"><a href="#cb15-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">message_property_mappings</span><span class="kw">:</span><span class="co"> # this is a mapping from keys in your dataset to keys in chat_template</span></span>
|
||||
<span id="cb15-5"><a href="#cb15-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">role</span><span class="kw">:</span><span class="at"> role</span></span>
|
||||
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">content</span><span class="kw">:</span><span class="at"> content</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb15-6"><a href="#cb15-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">content</span><span class="kw">:</span><span class="at"> content</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>In some <code>chat_templates</code> (e.g. <a href="https://huggingface.co/google/gemma-2b-it/blob/main/tokenizer_config.json#L1507">Gemma</a>), the roles are hardcoded to <code>user</code> and <code>assistant</code>. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a <code>KeyError</code>, it would be necessary to add mapping for your roles. Here is an example of how it would look like:</p>
|
||||
<div class="sourceCode" id="cb16"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb16"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb16-1"><a href="#cb16-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb16-2"><a href="#cb16-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||||
<span id="cb16-3"><a href="#cb16-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
|
||||
<span id="cb16-4"><a href="#cb16-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">assistant</span><span class="kw">:</span></span>
|
||||
<span id="cb16-5"><a href="#cb16-5" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> gpt</span></span>
|
||||
<span id="cb16-6"><a href="#cb16-6" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> model</span></span>
|
||||
<span id="cb16-7"><a href="#cb16-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span></span>
|
||||
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb16-8"><a href="#cb16-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>In the example above, all <code>gpt</code> and <code>model</code> values are converted to <code>assistant</code>. All <code>human</code> values are converted to <code>user.</code></p>
|
||||
</section>
|
||||
<section id="handling-masking" class="level5">
|
||||
<h5 class="anchored" data-anchor-id="handling-masking">Handling masking</h5>
|
||||
<p>The common use case for <code>chat_template</code> is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.</p>
|
||||
<p>To train on all <code>assistant</code> messages, you would set the following configs.</p>
|
||||
<div class="sourceCode" id="cb17"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb17"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb17-1"><a href="#cb17-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb17-2"><a href="#cb17-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||||
<span id="cb17-3"><a href="#cb17-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">]</span></span>
|
||||
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> </span><span class="st">"turn"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb17-4"><a href="#cb17-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> </span><span class="st">"turn"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>The <code>train_on_eos</code> config means that it would mask all EOS tokens for turns that aren’t assistant-turns. The other options are: <code>all</code> and <code>last</code> to choose which EOS to train on.</p>
|
||||
<p>Perhaps, you want to train on <code>assistant</code> and <code>narrator</code> roles, you can simply add <code>narrator</code> to the list of <code>roles_to_train</code>. You would also need to add it to the mapping of <code>roles</code> above.</p>
|
||||
<div class="sourceCode" id="cb18"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb18"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb18-1"><a href="#cb18-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb18-2"><a href="#cb18-2" aria-hidden="true" tabindex="-1"></a><span class="at"> ...</span></span>
|
||||
<span id="cb18-3"><a href="#cb18-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles_to_train</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"assistant"</span><span class="kw">,</span><span class="at"> </span><span class="st">"narrator"</span><span class="kw">]</span></span>
|
||||
<span id="cb18-4"><a href="#cb18-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">roles</span><span class="kw">:</span></span>
|
||||
@@ -867,7 +873,7 @@ The answer is 8.<|im_end|></code></pre>
|
||||
<span id="cb18-7"><a href="#cb18-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> model</span></span>
|
||||
<span id="cb18-8"><a href="#cb18-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">user</span><span class="kw">:</span></span>
|
||||
<span id="cb18-9"><a href="#cb18-9" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> human</span></span>
|
||||
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">narrator</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"narrator"</span><span class="kw">]</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb18-10"><a href="#cb18-10" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">narrator</span><span class="kw">:</span><span class="at"> </span><span class="kw">[</span><span class="st">"narrator"</span><span class="kw">]</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<div class="callout callout-style-default callout-tip callout-titled">
|
||||
<div class="callout-header d-flex align-content-center">
|
||||
<div class="callout-icon-container">
|
||||
@@ -879,15 +885,15 @@ Tip
|
||||
</div>
|
||||
<div class="callout-body-container callout-body">
|
||||
<p>As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, <code>ChatML</code> uses <code><|im_end|></code> to end turns.</p>
|
||||
<div class="sourceCode" id="cb19"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||||
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> <|im_end|></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb19"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb19-1"><a href="#cb19-1" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||||
<span id="cb19-2"><a href="#cb19-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> <|im_end|></span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
<section id="applying-chat_template" class="level5">
|
||||
<h5 class="anchored" data-anchor-id="applying-chat_template">Applying <code>chat_template</code></h5>
|
||||
<p>Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.</p>
|
||||
<div class="sourceCode" id="cb20"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb20"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb20-1"><a href="#cb20-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb20-2"><a href="#cb20-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb20-3"><a href="#cb20-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> chat_template</span></span>
|
||||
<span id="cb20-4"><a href="#cb20-4" aria-hidden="true" tabindex="-1"></a></span>
|
||||
@@ -914,7 +920,7 @@ Tip
|
||||
<span id="cb20-25"><a href="#cb20-25" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">train_on_eos</span><span class="kw">:</span><span class="at"> </span><span class="st">"turn"</span></span>
|
||||
<span id="cb20-26"><a href="#cb20-26" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb20-27"><a href="#cb20-27" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||||
<span id="cb20-28"><a href="#cb20-28" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> <|im_end|></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb20-28"><a href="#cb20-28" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> <|im_end|></span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via <code>axolotl preprocess config.yaml --debug</code>):</p>
|
||||
<pre><code><|im_start|>(-100, 128256) user(-100, 882)
|
||||
(-100, 198) Hi(-100, 13347) <|im_end|>(-100, 128257)
|
||||
@@ -950,7 +956,7 @@ Note
|
||||
<h3 class="anchored" data-anchor-id="instruction-dataset">Instruction Dataset</h3>
|
||||
<p>Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.</p>
|
||||
<p>An example is of a common format called Alpaca:</p>
|
||||
<div class="sourceCode" id="cb22"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb22"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"instruction"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Using those keys, a prompt can be built based on it.</p>
|
||||
<pre><code>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
|
||||
|
||||
@@ -963,16 +969,16 @@ Note
|
||||
### Response:
|
||||
{output}</code></pre>
|
||||
<p>This can be configured as such:</p>
|
||||
<div class="sourceCode" id="cb24"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb24"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb24-1"><a href="#cb24-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb24-2"><a href="#cb24-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> A.jsonl</span></span>
|
||||
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> alpaca</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb24-3"><a href="#cb24-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> alpaca</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>Axolotl supports many kinds of instruction dataset. All of them can be found in the <a href="../../docs/dataset-formats/inst_tune.html">Instruction Dataset Documentation</a> with their respective type and sample row format.</p>
|
||||
<section id="custom-instruct-prompt-format" class="level4">
|
||||
<h4 class="anchored" data-anchor-id="custom-instruct-prompt-format">Custom Instruct Prompt Format</h4>
|
||||
<p>Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.</p>
|
||||
<p>In the example below, a sample row is used to output in <code>mistral_v1</code> format.</p>
|
||||
<div class="sourceCode" id="cb25"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb26"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb25"><pre class="sourceCode json code-with-copy"><code class="sourceCode json"><span id="cb25-1"><a href="#cb25-1" aria-hidden="true" tabindex="-1"></a><span class="fu">{</span><span class="dt">"input"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">,</span> <span class="dt">"output"</span><span class="fu">:</span> <span class="st">"..."</span><span class="fu">}</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<div class="code-copy-outer-scaffold"><div class="sourceCode" id="cb26"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb26-1"><a href="#cb26-1" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb26-2"><a href="#cb26-2" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> repo</span></span>
|
||||
<span id="cb26-3"><a href="#cb26-3" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span></span>
|
||||
<span id="cb26-4"><a href="#cb26-4" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">system_prompt</span><span class="kw">:</span><span class="at"> </span><span class="st">""</span></span>
|
||||
@@ -987,7 +993,7 @@ Note
|
||||
<span id="cb26-13"><a href="#cb26-13" aria-hidden="true" tabindex="-1"></a> [INST] {instruction} {input} [/INST]</span>
|
||||
<span id="cb26-14"><a href="#cb26-14" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb26-15"><a href="#cb26-15" aria-hidden="true" tabindex="-1"></a><span class="co"> # single-line example without input</span></span>
|
||||
<span id="cb26-16"><a href="#cb26-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"[INST] {instruction} [/INST]"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<span id="cb26-16"><a href="#cb26-16" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">no_input_format</span><span class="kw">:</span><span class="at"> </span><span class="st">"[INST] {instruction} [/INST]"</span></span></code></pre></div><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></div>
|
||||
<p>The config sets that the <code>field_instruction</code> is actually named <code>input</code>, and the <code>field_input</code> is empty as we don’t have an <code>input</code> in this sample. Generally, <code>instruction</code> can be thought as the question to the model, and <code>input</code> as the additional information with <code>output</code> being the response. It is not necessary to have an <code>input</code> nor <code>system</code>. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.</p>
|
||||
<p>Reference: <a href="../../docs/dataset-formats/inst_tune.html#how-to-add-custom-prompt-format">Custom Instruct Prompt Format Documentation</a>.</p>
|
||||
</section>
|
||||
@@ -1052,13 +1058,14 @@ Note
|
||||
e.clearSelection();
|
||||
}
|
||||
const getTextToCopy = function(trigger) {
|
||||
const codeEl = trigger.previousElementSibling.cloneNode(true);
|
||||
for (const childEl of codeEl.children) {
|
||||
if (isCodeAnnotation(childEl)) {
|
||||
childEl.remove();
|
||||
}
|
||||
const outerScaffold = trigger.parentElement.cloneNode(true);
|
||||
const codeEl = outerScaffold.querySelector('code');
|
||||
for (const childEl of codeEl.children) {
|
||||
if (isCodeAnnotation(childEl)) {
|
||||
childEl.remove();
|
||||
}
|
||||
return codeEl.innerText;
|
||||
}
|
||||
return codeEl.innerText;
|
||||
}
|
||||
const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
|
||||
text: getTextToCopy
|
||||
|
||||
Reference in New Issue
Block a user