diff --git a/.nojekyll b/.nojekyll
index 50e4ddf8e..7052830ab 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-d0e306c9
\ No newline at end of file
+463de0cb
\ No newline at end of file
diff --git a/docs/api/kernels.lora.html b/docs/api/kernels.lora.html
index e858b9c75..afeaeefac 100644
--- a/docs/api/kernels.lora.html
+++ b/docs/api/kernels.lora.html
@@ -639,11 +639,11 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
-None for weights/quantization states
+None for weights/biases/quantization states
-None)
+None for activation functions and flags
Forward pass for LoRA MLP.
|
-| tuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None] |
+tuple[torch.Tensor, None, None, None, torch.Tensor, torch.Tensor, None] |
| Tuple containing gradients for all forward inputs | required |
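The widened gradient tuples in this diff follow the `torch.autograd.Function` contract: `backward` must return exactly one entry per `forward` input, with `None` in the slots of non-differentiable inputs such as quantization states, scales, and flags, so adding a bias input adds one more slot. A minimal illustrative sketch of that pattern (a toy function, not the library's kernel; all names here are hypothetical):

```python
import torch

class BiasedMatmul(torch.autograd.Function):
    """Toy autograd Function showing the one-gradient-per-input contract.

    forward(ctx, X, W, b, s) takes tensors X, W, b and a non-differentiable
    float scale s; backward therefore returns a 4-tuple (dX, dW, db, None).
    """

    @staticmethod
    def forward(ctx, X, W, b, s):
        ctx.save_for_backward(X, W)
        ctx.s = s
        return s * (X @ W.t()) + b

    @staticmethod
    def backward(ctx, dY):
        X, W = ctx.saved_tensors
        dX = ctx.s * dY @ W          # gradient w.r.t. X
        dW = ctx.s * dY.t() @ X      # gradient w.r.t. W
        db = dY.sum(dim=0)           # bias gradient: sum over the batch dim
        return dX, dW, db, None      # None: the float scale gets no gradient
```

In the documented kernels the same rule produces the long `tuple[torch.Tensor, None, None, ...]` annotations: every `None` lines up with a weight state, scale, or flag argument of the corresponding `forward`.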
| b |
+torch.Tensor |
+Output projection bias |
+required |
+|
| W_quant | QuantState | None | Weight quantization state | required | |
| A |
-torch.Tensor | None |
+torch.Tensor |
LoRA A matrix | required |
| B |
-torch.Tensor | None |
+torch.Tensor |
LoRA B matrix | required |
| S | +||||
| s | float | LoRA scaling factor | required |
@@ -1020,7 +1041,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
|
| torch.Tensor |
-Output projection tensor |
+Output projection result |
|
-| tuple[torch.Tensor, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None] |
+tuple[torch.Tensor, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None, None, None, torch.Tensor | None, torch.Tensor | None, None, None] |
| Tuple containing gradients for all forward inputs | required |
| q_bias |
+torch.Tensor | None |
+Query projection bias |
+required |
+|
| q_quant | QuantState | None | Query quantization state | required | |
| q_A | torch.Tensor | None | Query LoRA A matrix | required | |
| q_B | torch.Tensor | None | Query LoRA B matrix | required | |
| q_scale | float | Query LoRA scale | required | |
| k_weight | torch.Tensor | Key projection weight | required | |
| k_bias |
+torch.Tensor | None |
+Key projection bias |
+required |
+|
| k_quant | QuantState | None |
@@ -1248,30 +1284,36 @@ supporting quantization and memory optimization.
required |
| v_bias |
+torch.Tensor | None |
+Value projection bias |
+required |
+|
| v_quant | QuantState | None | Value quantization state | required | |
| v_A | torch.Tensor | None | Value LoRA A matrix | required | |
| v_B | torch.Tensor | None | Value LoRA B matrix | required | |
| v_scale | float | Value LoRA scale | required | |
| inplace | bool | Whether to perform operations in-place |
@@ -1625,17 +1667,17 @@ supporting quantization and memory optimization.
|
| torch.Tensor |
-A tuple containing the base weight matrix, quantization state, LoRA A matrix, |
+A tuple containing the base weights, quantization state, LoRA A and B weights, |
|
-| QuantState | None |
-LoRA B matrix, and scaling factor. States and matrices may be None if not |
+torch.Tensor | None |
+scaling factor, and base layer bias. Quant state, weights, and bias may be |
|
-| torch.Tensor | None |
-available. |
+QuantState | None |
+None if not available. |
-kernels.lora.matmul_lora(X, W, W_quant, A, B, s, out=None)
+kernels.lora.matmul_lora(X, W, b, W_quant, A, B, s, out=None)
Efficient fused matmul + LoRA computation.
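The updated signature adds a bias term `b` after the base weight. Mathematically the fused computation is `X @ W.T + b + s * (X @ A.T) @ B.T`; the sketch below spells that out in plain PyTorch as a reference. It is an illustration of the semantics, not the library's implementation: it assumes `W_quant is None` (dequantization of quantized weights is out of scope here) and that `A`/`B` may be `None` when no LoRA adapter is attached.

```python
import torch

def matmul_lora(X, W, b, W_quant, A, B, s, out=None):
    """Unfused reference for matmul + LoRA: X @ W.T + b + s * (X @ A.T) @ B.T.

    Sketch only: W_quant must be None (no dequantization handled), and the
    LoRA update is skipped when A or B is None.
    """
    assert W_quant is None, "quantized weights not handled in this sketch"
    out = torch.matmul(X, W.t(), out=out)
    if b is not None:
        out += b
    if A is not None and B is not None:
        # Low-rank update: project down with A, back up with B, scale by s.
        out += s * torch.matmul(torch.matmul(X, A.t()), B.t())
    return out
```

Computing the low-rank product as `(X @ A.T) @ B.T` rather than materializing `B @ A` keeps the cost proportional to the LoRA rank, which is the point of the fused kernel this mirrors.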