Built site for gh-pages
This commit is contained in:
@@ -170,8 +170,11 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<span class="menu-text">docs/fdsp_qlora.qmd</span>
|
||||
</li>
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">FSDP + QLoRA</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/input_output.html" class="sidebar-item-text sidebar-link active">
|
||||
@@ -206,8 +209,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" aria-expanded="true">
|
||||
<span class="menu-text">Reference</span></a>
|
||||
<a href="../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Dataset Formats</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
@@ -215,6 +218,47 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Conversation</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Instruction Tuning</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Pre-training</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Template-Free</span></a>
|
||||
</div>
|
||||
</li>
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="sidebar-item sidebar-item-section">
|
||||
<div class="sidebar-item-container">
|
||||
<a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" aria-expanded="true">
|
||||
<span class="menu-text">Reference</span></a>
|
||||
<a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" aria-expanded="true" aria-label="Toggle section">
|
||||
<i class="bi bi-chevron-right ms-2"></i>
|
||||
</a>
|
||||
</div>
|
||||
<ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth1 show">
|
||||
<li class="sidebar-item">
|
||||
<div class="sidebar-item-container">
|
||||
<a href="../docs/config.html" class="sidebar-item-text sidebar-link">
|
||||
<span class="menu-text">Config options</span></a>
|
||||
</div>
|
||||
@@ -336,29 +380,29 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<section id="prepare-data" class="level3">
|
||||
<h3 class="anchored" data-anchor-id="prepare-data">1. Prepare Data</h3>
|
||||
<p>To use the <code>input_output</code> format, collect your data in the following format into a jsonl file (below is the first row from the file <code>output.jsonl</code>, pretty printed):</p>
|
||||
<div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> head <span class="at">-n1</span> output.jsonl <span class="kw">|</span> <span class="ex">python</span> <span class="at">-m</span> json.tool</span>
|
||||
<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a><span class="ex">{.cell-output</span> .cell-output-stdout}</span>
|
||||
<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a> <span class="st">"segments"</span><span class="ex">:</span> [</span>
|
||||
<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"<s>Hello\n"</span></span>
|
||||
<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb3-10"><a href="#cb3-10" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb3-11"><a href="#cb3-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb3-12"><a href="#cb3-12" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"hi there!. "</span></span>
|
||||
<span id="cb3-13"><a href="#cb3-13" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb3-14"><a href="#cb3-14" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb3-15"><a href="#cb3-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> false,</span>
|
||||
<span id="cb3-16"><a href="#cb3-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"goodbye "</span></span>
|
||||
<span id="cb3-17"><a href="#cb3-17" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb3-18"><a href="#cb3-18" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb3-19"><a href="#cb3-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb3-20"><a href="#cb3-20" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"farewell</s>"</span></span>
|
||||
<span id="cb3-21"><a href="#cb3-21" aria-hidden="true" tabindex="-1"></a> <span class="kw">}</span></span>
|
||||
<span id="cb3-22"><a href="#cb3-22" aria-hidden="true" tabindex="-1"></a> <span class="ex">]</span></span>
|
||||
<span id="cb3-23"><a href="#cb3-23" aria-hidden="true" tabindex="-1"></a> <span class="kw">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb3"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> head <span class="at">-n1</span> output.jsonl <span class="kw">|</span> <span class="ex">python</span> <span class="at">-m</span> json.tool</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="cell-output cell-output-stdout">
|
||||
<pre><code>{
|
||||
"segments": [
|
||||
{
|
||||
"label": true,
|
||||
"text": "<s>Hello\n"
|
||||
},
|
||||
{
|
||||
"label": true,
|
||||
"text": "hi there!. "
|
||||
},
|
||||
{
|
||||
"label": false,
|
||||
"text": "goodbye "
|
||||
},
|
||||
{
|
||||
"label": true,
|
||||
"text": "farewell</s>"
|
||||
}
|
||||
]
|
||||
}</code></pre>
|
||||
</div>
|
||||
<p>Set <code>label:false</code> when you want to mask a segment of text so that the model isn’t trained on it. Some things to keep in mind:</p>
|
||||
<blockquote class="blockquote">
|
||||
<p>[!IMPORTANT] 1. <strong>EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl concatenates all the segments as-is.</strong> The tokenizer doesn’t add anything additional. Notice how I added spaces, newlines, <code>&lt;s&gt;</code> (BOS), and <code>&lt;/s&gt;</code> (EOS) myself. 2. Make sure you check the materialized output to validate that the prompt is getting assembled how you like.</p>
|
||||
@@ -368,60 +412,60 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
<section id="use-type-input_output" class="level3">
|
||||
<h3 class="anchored" data-anchor-id="use-type-input_output">2. Use <code>type: input_output</code></h3>
|
||||
<p>Let’s materialize data with our <code>output.jsonl</code> file by setting <code>type: input_output</code> in our axolotl config:</p>
|
||||
<div class="sourceCode" id="cb4"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a><span class="co"># training_config.yaml</span></span>
|
||||
<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> mistralai/Mistral-7B-v0.1</span></span>
|
||||
<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a><span class="fu">data_seed</span><span class="kw">:</span><span class="at"> </span><span class="dv">49</span></span>
|
||||
<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a><span class="fu">seed</span><span class="kw">:</span><span class="at"> </span><span class="dv">49</span></span>
|
||||
<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-6"><a href="#cb4-6" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb4-7"><a href="#cb4-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> output.jsonl</span></span>
|
||||
<span id="cb4-8"><a href="#cb4-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> input_output</span></span>
|
||||
<span id="cb4-9"><a href="#cb4-9" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.1</span></span>
|
||||
<span id="cb4-10"><a href="#cb4-10" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-11"><a href="#cb4-11" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_len</span><span class="kw">:</span><span class="at"> </span><span class="dv">896</span></span>
|
||||
<span id="cb4-12"><a href="#cb4-12" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||||
<span id="cb4-13"><a href="#cb4-13" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-14"><a href="#cb4-14" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||||
<span id="cb4-15"><a href="#cb4-15" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> </span><span class="dv">3</span></span>
|
||||
<span id="cb4-16"><a href="#cb4-16" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||||
<span id="cb4-17"><a href="#cb4-17" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
|
||||
<span id="cb4-18"><a href="#cb4-18" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0002</span></span>
|
||||
<span id="cb4-19"><a href="#cb4-19" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb4-20"><a href="#cb4-20" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||||
<span id="cb4-21"><a href="#cb4-21" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||||
<span id="cb4-22"><a href="#cb4-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"<s>"</span></span>
|
||||
<span id="cb4-23"><a href="#cb4-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"</s>"</span></span>
|
||||
<span id="cb4-24"><a href="#cb4-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"<unk>"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb5"><pre class="sourceCode yaml code-with-copy"><code class="sourceCode yaml"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co"># training_config.yaml</span></span>
|
||||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="fu">base_model</span><span class="kw">:</span><span class="at"> mistralai/Mistral-7B-v0.1</span></span>
|
||||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="fu">data_seed</span><span class="kw">:</span><span class="at"> </span><span class="dv">49</span></span>
|
||||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="fu">seed</span><span class="kw">:</span><span class="at"> </span><span class="dv">49</span></span>
|
||||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a><span class="fu">datasets</span><span class="kw">:</span></span>
|
||||
<span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="kw">-</span><span class="at"> </span><span class="fu">path</span><span class="kw">:</span><span class="at"> output.jsonl</span></span>
|
||||
<span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">type</span><span class="kw">:</span><span class="at"> input_output</span></span>
|
||||
<span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a><span class="fu">val_set_size</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.1</span></span>
|
||||
<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a><span class="fu">sequence_len</span><span class="kw">:</span><span class="at"> </span><span class="dv">896</span></span>
|
||||
<span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a><span class="fu">sample_packing</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||||
<span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a><span class="fu">micro_batch_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||||
<span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_accumulation_steps</span><span class="kw">:</span><span class="at"> </span><span class="dv">3</span></span>
|
||||
<span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a><span class="fu">eval_batch_size</span><span class="kw">:</span><span class="at"> </span><span class="dv">2</span></span>
|
||||
<span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a><span class="fu">num_epochs</span><span class="kw">:</span><span class="at"> </span><span class="dv">1</span></span>
|
||||
<span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a><span class="fu">learning_rate</span><span class="kw">:</span><span class="at"> </span><span class="fl">0.0002</span></span>
|
||||
<span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a><span class="fu">train_on_inputs</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
|
||||
<span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a><span class="fu">special_tokens</span><span class="kw">:</span></span>
|
||||
<span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">bos_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"<s>"</span></span>
|
||||
<span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">eos_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"</s>"</span></span>
|
||||
<span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a><span class="at"> </span><span class="fu">unk_token</span><span class="kw">:</span><span class="at"> </span><span class="st">"<unk>"</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<p>You can use the following command to materialize your data. The <code>--debug</code> flag will print the tokens, along with the labels so you can verify that the correct items are being ignored:</p>
|
||||
<div class="sourceCode" id="cb5"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> python <span class="at">-m</span> axolotl.cli.preprocess training_config.yaml <span class="at">--debug</span></span>
|
||||
<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a><span class="ex">...</span></span>
|
||||
<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a><span class="ex">[2024-03-05</span> 23:36:46,969] <span class="pp">[</span><span class="ss">INFO</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">axolotl.check_example_labels:35</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">PID:607731</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">RANK:0</span><span class="pp">]</span> <span class="op"><</span>s<span class="op">>(</span><span class="ex">1,</span> 1<span class="op">)</span> Hello<span class="er">(</span><span class="ex">22557,</span> 22557<span class="kw">)</span></span>
|
||||
<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">13,</span> 13<span class="kw">)</span> <span class="ex">hi</span><span class="er">(</span><span class="ex">12014,</span> 12014<span class="kw">)</span> <span class="ex">there</span><span class="er">(</span><span class="ex">736,</span> 736<span class="kw">)</span> <span class="ex">!</span><span class="er">(</span><span class="ex">28808,</span> 28808<span class="kw">)</span> <span class="bu">.</span><span class="er">(</span><span class="ex">28723,</span> 28723<span class="kw">)</span> <span class="kw">(</span><span class="ex">28705,</span> 28705<span class="kw">)</span> <span class="ex">good</span><span class="er">(</span><span class="ex">-100,</span> 1179<span class="kw">)</span> <span class="ex">bye</span><span class="er">(</span><span class="ex">-100,</span> 17664<span class="kw">)</span> <span class="kw">(</span><span class="ex">-100,</span> 28705<span class="kw">)</span> <span class="ex">fare</span><span class="er">(</span><span class="ex">19111,</span> 19111<span class="kw">)</span> <span class="ex">well</span><span class="er">(</span><span class="ex">5458,</span> 5458<span class="kw">)</span> <span class="op"><</span>/s<span class="op">>(</span><span class="ex">2,</span> 2<span class="op">)</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb6"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> python <span class="at">-m</span> axolotl.cli.preprocess training_config.yaml <span class="at">--debug</span></span>
|
||||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="ex">...</span></span>
|
||||
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a><span class="ex">[2024-03-05</span> 23:36:46,969] <span class="pp">[</span><span class="ss">INFO</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">axolotl.check_example_labels:35</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">PID:607731</span><span class="pp">]</span> <span class="pp">[</span><span class="ss">RANK:0</span><span class="pp">]</span> <span class="op"><</span>s<span class="op">>(</span><span class="ex">1,</span> 1<span class="op">)</span> Hello<span class="er">(</span><span class="ex">22557,</span> 22557<span class="kw">)</span></span>
|
||||
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a><span class="kw">(</span><span class="ex">13,</span> 13<span class="kw">)</span> <span class="ex">hi</span><span class="er">(</span><span class="ex">12014,</span> 12014<span class="kw">)</span> <span class="ex">there</span><span class="er">(</span><span class="ex">736,</span> 736<span class="kw">)</span> <span class="ex">!</span><span class="er">(</span><span class="ex">28808,</span> 28808<span class="kw">)</span> <span class="bu">.</span><span class="er">(</span><span class="ex">28723,</span> 28723<span class="kw">)</span> <span class="kw">(</span><span class="ex">28705,</span> 28705<span class="kw">)</span> <span class="ex">good</span><span class="er">(</span><span class="ex">-100,</span> 1179<span class="kw">)</span> <span class="ex">bye</span><span class="er">(</span><span class="ex">-100,</span> 17664<span class="kw">)</span> <span class="kw">(</span><span class="ex">-100,</span> 28705<span class="kw">)</span> <span class="ex">fare</span><span class="er">(</span><span class="ex">19111,</span> 19111<span class="kw">)</span> <span class="ex">well</span><span class="er">(</span><span class="ex">5458,</span> 5458<span class="kw">)</span> <span class="op"><</span>/s<span class="op">>(</span><span class="ex">2,</span> 2<span class="op">)</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<p>The format is <code>decoded_token</code>(<code>label</code>, <code>token_id</code>), for example, <code>&lt;s&gt;(1, 1)</code> means that the token is <code>&lt;s&gt;</code>, the label is <code>1</code> and the token_id is <code>1</code>. When the label is <code>-100</code> then that token is ignored for training.</p>
|
||||
<p><a id="markdown-3-check-the-prompts" name="3-check-the-prompts"></a></p>
|
||||
</section>
|
||||
<section id="check-the-prompts" class="level3">
|
||||
<h3 class="anchored" data-anchor-id="check-the-prompts">3. Check the prompts</h3>
|
||||
<p>Here is another way to check the materialized output:</p>
|
||||
<div class="sourceCode" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> AutoTokenizer</span>
|
||||
<span id="cb6-2"><a href="#cb6-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> datasets <span class="im">import</span> load_from_disk</span>
|
||||
<span id="cb6-3"><a href="#cb6-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> yaml</span>
|
||||
<span id="cb6-4"><a href="#cb6-4" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb6-5"><a href="#cb6-5" aria-hidden="true" tabindex="-1"></a>directory <span class="op">=</span> <span class="op">!</span>ls last_run_prepared<span class="op">/</span></span>
|
||||
<span id="cb6-6"><a href="#cb6-6" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">'training_config.yaml'</span>, <span class="st">'r'</span>) <span class="im">as</span> f:</span>
|
||||
<span id="cb6-7"><a href="#cb6-7" aria-hidden="true" tabindex="-1"></a> cfg <span class="op">=</span> yaml.safe_load(f)</span>
|
||||
<span id="cb6-8"><a href="#cb6-8" aria-hidden="true" tabindex="-1"></a>model_id <span class="op">=</span> cfg[<span class="st">'base_model'</span>]</span>
|
||||
<span id="cb6-9"><a href="#cb6-9" aria-hidden="true" tabindex="-1"></a>tok <span class="op">=</span> AutoTokenizer.from_pretrained(model_id)</span>
|
||||
<span id="cb6-10"><a href="#cb6-10" aria-hidden="true" tabindex="-1"></a>ds <span class="op">=</span> load_from_disk(<span class="ss">f'last_run_prepared/</span><span class="sc">{</span>directory[<span class="dv">0</span>]<span class="sc">}</span><span class="ss">/'</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="op">>>></span> row <span class="op">=</span> ds[<span class="dv">0</span>]</span>
|
||||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="op">>>></span> <span class="bu">print</span>(tok.decode(row[<span class="st">'input_ids'</span>]))</span>
|
||||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="op"><</span>s<span class="op">></span> Hello</span>
|
||||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a> hi there<span class="op">!</span>. goodbye farewell<span class="op"></</span>s<span class="op">></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> transformers <span class="im">import</span> AutoTokenizer</span>
|
||||
<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a><span class="im">from</span> datasets <span class="im">import</span> load_from_disk</span>
|
||||
<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> yaml</span>
|
||||
<span id="cb7-4"><a href="#cb7-4" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb7-5"><a href="#cb7-5" aria-hidden="true" tabindex="-1"></a>directory <span class="op">=</span> <span class="op">!</span>ls last_run_prepared<span class="op">/</span></span>
|
||||
<span id="cb7-6"><a href="#cb7-6" aria-hidden="true" tabindex="-1"></a><span class="cf">with</span> <span class="bu">open</span>(<span class="st">'training_config.yaml'</span>, <span class="st">'r'</span>) <span class="im">as</span> f:</span>
|
||||
<span id="cb7-7"><a href="#cb7-7" aria-hidden="true" tabindex="-1"></a> cfg <span class="op">=</span> yaml.safe_load(f)</span>
|
||||
<span id="cb7-8"><a href="#cb7-8" aria-hidden="true" tabindex="-1"></a>model_id <span class="op">=</span> cfg[<span class="st">'base_model'</span>]</span>
|
||||
<span id="cb7-9"><a href="#cb7-9" aria-hidden="true" tabindex="-1"></a>tok <span class="op">=</span> AutoTokenizer.from_pretrained(model_id)</span>
|
||||
<span id="cb7-10"><a href="#cb7-10" aria-hidden="true" tabindex="-1"></a>ds <span class="op">=</span> load_from_disk(<span class="ss">f'last_run_prepared/</span><span class="sc">{</span>directory[<span class="dv">0</span>]<span class="sc">}</span><span class="ss">/'</span>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="op">>>></span> row <span class="op">=</span> ds[<span class="dv">0</span>]</span>
|
||||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a><span class="op">>>></span> <span class="bu">print</span>(tok.decode(row[<span class="st">'input_ids'</span>]))</span>
|
||||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a><span class="op"><</span>s<span class="op">></span> Hello</span>
|
||||
<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a> hi there<span class="op">!</span>. goodbye farewell<span class="op"></</span>s<span class="op">></span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<p>We can check that the right tokens are ignored by comparing the labels to each token:</p>
|
||||
<div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
|
||||
<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>pd.DataFrame([{<span class="st">'token'</span>: tok.decode(i), <span class="st">'label'</span>: l, <span class="st">'id'</span>:i} <span class="cf">for</span> i,l <span class="kw">in</span></span>
|
||||
<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a> <span class="bu">zip</span>(row[<span class="st">'input_ids'</span>], row[<span class="st">'labels'</span>])])</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="im">import</span> pandas <span class="im">as</span> pd</span>
|
||||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a>pd.DataFrame([{<span class="st">'token'</span>: tok.decode(i), <span class="st">'label'</span>: l, <span class="st">'id'</span>:i} <span class="cf">for</span> i,l <span class="kw">in</span></span>
|
||||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a> <span class="bu">zip</span>(row[<span class="st">'input_ids'</span>], row[<span class="st">'labels'</span>])])</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr class="header">
|
||||
@@ -504,29 +548,29 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
|
||||
</tbody>
|
||||
</table>
|
||||
<p>If we look at the input data, the above table seems correct! (The jsonl version is repeated below for reference):</p>
|
||||
<div class="sourceCode" id="cb9"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> head <span class="at">-n1</span> output.jsonl <span class="kw">|</span> <span class="ex">python</span> <span class="at">-m</span> json.tool</span>
|
||||
<span id="cb9-2"><a href="#cb9-2" aria-hidden="true" tabindex="-1"></a></span>
|
||||
<span id="cb9-3"><a href="#cb9-3" aria-hidden="true" tabindex="-1"></a><span class="ex">{.cell-output</span> .cell-output-stdout}</span>
|
||||
<span id="cb9-4"><a href="#cb9-4" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb9-5"><a href="#cb9-5" aria-hidden="true" tabindex="-1"></a> <span class="st">"segments"</span><span class="ex">:</span> [</span>
|
||||
<span id="cb9-6"><a href="#cb9-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb9-7"><a href="#cb9-7" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb9-8"><a href="#cb9-8" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"&lt;s&gt;Hello\n"</span></span>
|
||||
<span id="cb9-9"><a href="#cb9-9" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb9-10"><a href="#cb9-10" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb9-11"><a href="#cb9-11" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb9-12"><a href="#cb9-12" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"hi there!. "</span></span>
|
||||
<span id="cb9-13"><a href="#cb9-13" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb9-14"><a href="#cb9-14" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb9-15"><a href="#cb9-15" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> false,</span>
|
||||
<span id="cb9-16"><a href="#cb9-16" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"goodbye "</span></span>
|
||||
<span id="cb9-17"><a href="#cb9-17" aria-hidden="true" tabindex="-1"></a> <span class="ex">},</span></span>
|
||||
<span id="cb9-18"><a href="#cb9-18" aria-hidden="true" tabindex="-1"></a> <span class="kw">{</span></span>
|
||||
<span id="cb9-19"><a href="#cb9-19" aria-hidden="true" tabindex="-1"></a> <span class="st">"label"</span><span class="ex">:</span> true,</span>
|
||||
<span id="cb9-20"><a href="#cb9-20" aria-hidden="true" tabindex="-1"></a> <span class="st">"text"</span><span class="ex">:</span> <span class="st">"farewell&lt;/s&gt;"</span></span>
|
||||
<span id="cb9-21"><a href="#cb9-21" aria-hidden="true" tabindex="-1"></a> <span class="kw">}</span></span>
|
||||
<span id="cb9-22"><a href="#cb9-22" aria-hidden="true" tabindex="-1"></a> <span class="ex">]</span></span>
|
||||
<span id="cb9-23"><a href="#cb9-23" aria-hidden="true" tabindex="-1"></a> <span class="kw">}</span></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="sourceCode" id="cb10"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a><span class="ex">$</span> head <span class="at">-n1</span> output.jsonl <span class="kw">|</span> <span class="ex">python</span> <span class="at">-m</span> json.tool</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
|
||||
<div class="cell-output cell-output-stdout">
|
||||
<pre><code>{
|
||||
"segments": [
|
||||
{
|
||||
"label": true,
|
||||
"text": "&lt;s&gt;Hello\n"
|
||||
},
|
||||
{
|
||||
"label": true,
|
||||
"text": "hi there!. "
|
||||
},
|
||||
{
|
||||
"label": false,
|
||||
"text": "goodbye "
|
||||
},
|
||||
{
|
||||
"label": true,
|
||||
"text": "farewell&lt;/s&gt;"
|
||||
}
|
||||
]
|
||||
}</code></pre>
|
||||
</div>
|
||||
|
||||
|
||||
</section>
|
||||
|
||||
Reference in New Issue
Block a user