import os

# Optionally, upload your own JSONL to your Google Drive
GOOGLE_DRIVE_PATH = ""  # ex: "MyDrive/Colab Notebooks/train.jsonl"

# When prompted, grant "Select All" permissions, or you may get the error:
# "MessageError: Error: credential propagation was unsuccessful"
if GOOGLE_DRIVE_PATH:
    from google.colab import drive

    # Mount your Google Drive
    GOOGLE_DRIVE_MNT = "/content/drive/"
    drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)
    tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip("/"))
    # make sure the file exists
    if not os.path.isfile(tmp_path):
        raise ValueError(f"File {tmp_path} does not exist")
    dataset_id = tmp_path
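If you bring your own JSONL, each line should be a single JSON object in the OpenAI-style messages format that the chat_template dataset type (used in the config below) consumes. A minimal sketch that writes one such record; the exact field layout is an assumption based on common chat_template usage, not a spec:

import json

# Hypothetical one-record training file in the "messages" format;
# real data would have one JSON object per line.
record = {
    "messages": [
        {"role": "user", "content": "What be the Pythagorean theorem?"},
        {"role": "assistant", "content": "Arr, a squared plus b squared be c squared, matey!"},
    ]
}
with open("train.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")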
from axolotl.cli.config import load_cfg  # validates the config; import path may vary across axolotl versions
from axolotl.utils.dict import DictDefault

# Axolotl provides full control and transparency over model and training configuration
config = DictDefault(
    base_model="Qwen/Qwen3-14B",  # use the instruct-tuned model, but we're aligning it to be a pirate
    load_in_4bit=True,  # set to True for QLoRA
    adapter="qlora",
    lora_r=32,
    lora_alpha=64,
    lora_target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",  # train self_attn linear modules
        "gate_proj",
        "down_proj",
        "up_proj",  # train MLP linear modules
    ],
    lora_qkv_kernel=True,  # optimized Triton kernels for LoRA
    lora_o_kernel=True,
    lora_mlp_kernel=True,
    embeddings_skip_upcast=True,  # keep embeddings in fp16 so the model fits in 15GB VRAM
    xformers_attention=True,  # use xformers on Colab w/ T4 for memory-efficient attention; flash_attention only on Ampere or above
    plugins=[
        # more efficient training using Apple's Cut Cross Entropy; https://github.com/apple/ml-cross-entropy
        "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
    ],
    sample_packing=True,  # 2-6x increase in tokens per micro-batch
    # when using packing, use a slightly higher learning rate to account for fewer steps;
    # alternatively, reduce micro_batch_size + gradient_accumulation_steps to achieve closer to the same number of steps/epoch
    learning_rate=0.00019,
    sequence_len=4096,  # larger sequence length improves packing efficiency for more tokens/sec
    micro_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,  # trade reduced VRAM for increased time
    gradient_checkpointing_kwargs={
        "use_reentrant": False,
    },
    optimizer="paged_adamw_8bit",
    lr_scheduler="cosine",
    warmup_steps=5,
    fp16=True,  # use float16 + automatic mixed precision; bfloat16 not supported on Colab w/ T4
    bf16=False,
    max_grad_norm=0.1,  # gradient clipping
    num_epochs=1,
    saves_per_epoch=2,  # how many checkpoints to save over one epoch
    logging_steps=1,
    output_dir="./outputs/qwen-sft-pirate-rrr",
    chat_template="qwen3",
    datasets=[
        {
            "path": dataset_id,  # Hugging Face dataset id or path to train.jsonl
            "type": "chat_template",
            "split": "train",
            "eot_tokens": ["<|im_end|>"],
        }
    ],
    dataloader_prefetch_factor=8,  # dataloader optimizations
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)

# validate the configuration
cfg = load_cfg(config)
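With micro_batch_size and gradient_accumulation_steps both at 1, each optimizer step sees one packed sequence of up to 4096 tokens. A quick sanity check on the validated config; this assumes cfg exposes these fields as attributes, which DictDefault supports:

# Effective sequences and max tokens per optimizer step under the settings above.
effective_batch = cfg.micro_batch_size * cfg.gradient_accumulation_steps
print(f"sequences/step: {effective_batch}, max tokens/step: {effective_batch * cfg.sequence_len}")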
from axolotl.utils import patch_optimized_env

# speed up downloads from HF 🤗 and set "PYTORCH_CUDA_ALLOC_CONF" env to save memory
patch_optimized_env()
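To confirm the tweaks took effect, you can print the environment afterwards. Which variables get set (and to what) depends on the axolotl version, so treat these names as assumptions; HF_HUB_ENABLE_HF_TRANSFER is a common mechanism for faster Hub downloads:

import os

# Both variable names below are assumptions about what patch_optimized_env() sets.
print(os.environ.get("PYTORCH_CUDA_ALLOC_CONF"))
print(os.environ.get("HF_HUB_ENABLE_HF_TRANSFER"))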
Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.
If you prefer to upload the training artifacts manually, you can still push the entire final checkpoint to Hugging Face from the CLI.
from huggingface_hub import notebook_login

# remove the partial epoch checkpoints (glob left unquoted so the shell expands it)
!rm -rf ./outputs/qwen-sft-pirate-rrr/checkpoint-*

# HF notebook login widget
notebook_login()

# upload the LoRA adapter for your model to HF; remember to update the username/model-name below
!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B ./outputs/qwen-sft-pirate-rrr
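Once uploaded, the adapter can be loaded back on top of the base model for inference with peft. A minimal sketch, assuming the repo name used above and enough VRAM (or 4-bit loading) for the 14B base:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Load the base model, then attach the uploaded LoRA adapter on top of it.
base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-14B", device_map="auto")
model = PeftModel.from_pretrained(base, "winglian/pirate-qwen-14B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-14B")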