From 71128160d56cbb7dd8ac76f31ef792c6d2517521 Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner <quarto-github-actions-publish@example.com>
Date: Tue, 13 May 2025 20:42:00 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                                     |    2 +-
 docs/api/cli.args.html                        |    3 +-
 docs/api/index.html                           |    8 +-
 ...s.gradient_checkpointing.offload_cpu.html} |   22 +-
 ...s.gradient_checkpointing.offload_disk.html | 1046 ++++++
 docs/config.html                              |    2 +-
 search.json                                   | 3296 +++++++++--------
 sitemap.xml                                   | 1306 +++----
 8 files changed, 3377 insertions(+), 2308 deletions(-)
 rename docs/api/{utils.gradient_checkpointing.unsloth.html => utils.gradient_checkpointing.offload_cpu.html} (96%)
 create mode 100644 docs/api/utils.gradient_checkpointing.offload_disk.html
diff --git a/.nojekyll b/.nojekyll
index f82194578..06a2f5738 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-6229eb51
\ No newline at end of file
+21d9bf7f
\ No newline at end of file
diff --git a/docs/api/cli.args.html b/docs/api/cli.args.html
index 6cac98ce3..52e78260a 100644
--- a/docs/api/cli.args.html
+++ b/docs/api/cli.args.html
@@ -556,7 +556,8 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 <span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a>    dtype<span class="op">=</span><span class="va">None</span>,</span>
 <span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a>    max_model_len<span class="op">=</span><span class="va">None</span>,</span>
 <span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a>    enable_prefix_caching<span class="op">=</span><span class="va">None</span>,</span>
-<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a>    serve_module<span class="op">=</span><span class="va">None</span>,</span>
+<span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 <p>Dataclass with CLI arguments for <code>axolotl vllm-serve</code> command.</p>
 
 
diff --git a/docs/api/index.html b/docs/api/index.html
index 91e262fa1..519a05307 100644
--- a/docs/api/index.html
+++ b/docs/api/index.html
@@ -926,8 +926,12 @@ ul.task-list li input[type="checkbox"] {
 <td>data handling specific to SFT</td>
 </tr>
 <tr class="even">
-<td><a href="../../docs/api/utils.gradient_checkpointing.unsloth.html#axolotl.utils.gradient_checkpointing.unsloth">utils.gradient_checkpointing.unsloth</a></td>
-<td>Unsloth checkpointing</td>
+<td><a href="../../docs/api/utils.gradient_checkpointing.offload_cpu.html#axolotl.utils.gradient_checkpointing.offload_cpu">utils.gradient_checkpointing.offload_cpu</a></td>
+<td>CPU offloaded checkpointing</td>
+</tr>
+<tr class="odd">
+<td><a href="../../docs/api/utils.gradient_checkpointing.offload_disk.html#axolotl.utils.gradient_checkpointing.offload_disk">utils.gradient_checkpointing.offload_disk</a></td>
+<td>DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching</td>
 </tr>
 </tbody>
 </table>
diff --git a/docs/api/utils.gradient_checkpointing.unsloth.html b/docs/api/utils.gradient_checkpointing.offload_cpu.html
similarity index 96%
rename from docs/api/utils.gradient_checkpointing.unsloth.html
rename to docs/api/utils.gradient_checkpointing.offload_cpu.html
index eac552d91..0d4ddef13 100644
--- a/docs/api/utils.gradient_checkpointing.unsloth.html
+++ b/docs/api/utils.gradient_checkpointing.offload_cpu.html
@@ -7,7 +7,7 @@
 <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
 
 
-<title>utils.gradient_checkpointing.unsloth – Axolotl</title>
+<title>utils.gradient_checkpointing.offload_cpu – Axolotl</title>
 <style>
 code{white-space: pre-wrap;}
 span.smallcaps{font-variant: small-caps;}
@@ -446,11 +446,11 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
     <h2 id="toc-title">On this page</h2>
    
   <ul>
-  <li><a href="#axolotl.utils.gradient_checkpointing.unsloth" id="toc-axolotl.utils.gradient_checkpointing.unsloth" class="nav-link active" data-scroll-target="#axolotl.utils.gradient_checkpointing.unsloth">utils.gradient_checkpointing.unsloth</a>
+  <li><a href="#axolotl.utils.gradient_checkpointing.offload_cpu" id="toc-axolotl.utils.gradient_checkpointing.offload_cpu" class="nav-link active" data-scroll-target="#axolotl.utils.gradient_checkpointing.offload_cpu">utils.gradient_checkpointing.offload_cpu</a>
   <ul class="collapse">
   <li><a href="#classes" id="toc-classes" class="nav-link" data-scroll-target="#classes">Classes</a>
   <ul class="collapse">
-  <li><a href="#axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer" id="toc-axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer" class="nav-link" data-scroll-target="#axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer">Unsloth_Offloaded_Gradient_Checkpointer</a></li>
+  <li><a href="#axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer" id="toc-axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer" class="nav-link" data-scroll-target="#axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer">CPU_Offloaded_Gradient_Checkpointer</a></li>
   </ul></li>
   </ul></li>
   </ul>
@@ -462,10 +462,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 
 
 
-<section id="axolotl.utils.gradient_checkpointing.unsloth" class="level1">
-<h1>utils.gradient_checkpointing.unsloth</h1>
-<p><code>utils.gradient_checkpointing.unsloth</code></p>
-<p>Unsloth checkpointing</p>
+<section id="axolotl.utils.gradient_checkpointing.offload_cpu" class="level1">
+<h1>utils.gradient_checkpointing.offload_cpu</h1>
+<p><code>utils.gradient_checkpointing.offload_cpu</code></p>
+<p>CPU offloaded checkpointing</p>
 <section id="classes" class="level2">
 <h2 class="anchored" data-anchor-id="classes">Classes</h2>
 <table class="caption-top table">
@@ -477,14 +477,14 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 </thead>
 <tbody>
 <tr class="odd">
-<td><a href="#axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer">Unsloth_Offloaded_Gradient_Checkpointer</a></td>
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer">CPU_Offloaded_Gradient_Checkpointer</a></td>
 <td>Saves VRAM by smartly offloading to RAM.</td>
 </tr>
 </tbody>
 </table>
-<section id="axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer" class="level3">
-<h3 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer">Unsloth_Offloaded_Gradient_Checkpointer</h3>
-<div class="sourceCode" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<section id="axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer" class="level3">
+<h3 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer">CPU_Offloaded_Gradient_Checkpointer</h3>
+<div class="sourceCode" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 <p>Saves VRAM by smartly offloading to RAM.
 Tiny hit to performance, since we mask the movement via non blocking calls.</p>
 
diff --git a/docs/api/utils.gradient_checkpointing.offload_disk.html b/docs/api/utils.gradient_checkpointing.offload_disk.html
new file mode 100644
index 000000000..555801511
--- /dev/null
+++ b/docs/api/utils.gradient_checkpointing.offload_disk.html
@@ -0,0 +1,1046 @@
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"><head>
+
+<meta charset="utf-8">
+<meta name="generator" content="quarto-1.7.31">
+
+<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes">
+
+
+<title>utils.gradient_checkpointing.offload_disk – Axolotl</title>
+<style>
+code{white-space: pre-wrap;}
+span.smallcaps{font-variant: small-caps;}
+div.columns{display: flex; gap: min(4vw, 1.5em);}
+div.column{flex: auto; overflow-x: auto;}
+div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
+ul.task-list{list-style: none;}
+ul.task-list li input[type="checkbox"] {
+  width: 0.8em;
+  margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ 
+  vertical-align: middle;
+}
+/* CSS for syntax highlighting */
+html { -webkit-text-size-adjust: 100%; }
+pre > code.sourceCode { white-space: pre; position: relative; }
+pre > code.sourceCode > span { display: inline-block; line-height: 1.25; }
+pre > code.sourceCode > span:empty { height: 1.2em; }
+.sourceCode { overflow: visible; }
+code.sourceCode > span { color: inherit; text-decoration: inherit; }
+div.sourceCode { margin: 1em 0; }
+pre.sourceCode { margin: 0; }
+@media screen {
+div.sourceCode { overflow: auto; }
+}
+@media print {
+pre > code.sourceCode { white-space: pre-wrap; }
+pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; }
+}
+pre.numberSource code
+  { counter-reset: source-line 0; }
+pre.numberSource code > span
+  { position: relative; left: -4em; counter-increment: source-line; }
+pre.numberSource code > span > a:first-child::before
+  { content: counter(source-line);
+    position: relative; left: -1em; text-align: right; vertical-align: baseline;
+    border: none; display: inline-block;
+    -webkit-touch-callout: none; -webkit-user-select: none;
+    -khtml-user-select: none; -moz-user-select: none;
+    -ms-user-select: none; user-select: none;
+    padding: 0 4px; width: 4em;
+  }
+pre.numberSource { margin-left: 3em;  padding-left: 4px; }
+div.sourceCode
+  {   }
+@media screen {
+pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; }
+}
+</style>
+
+
+<script src="../../site_libs/quarto-nav/quarto-nav.js"></script>
+<script src="../../site_libs/clipboard/clipboard.min.js"></script>
+<script src="../../site_libs/quarto-search/autocomplete.umd.js"></script>
+<script src="../../site_libs/quarto-search/fuse.min.js"></script>
+<script src="../../site_libs/quarto-search/quarto-search.js"></script>
+<meta name="quarto:offset" content="../../">
+<link href="../../favicon.jpg" rel="icon" type="image/jpeg">
+<script src="../../site_libs/quarto-html/quarto.js" type="module"></script>
+<script src="../../site_libs/quarto-html/tabsets/tabsets.js" type="module"></script>
+<script src="../../site_libs/quarto-html/popper.min.js"></script>
+<script src="../../site_libs/quarto-html/tippy.umd.min.js"></script>
+<script src="../../site_libs/quarto-html/anchor.min.js"></script>
+<link href="../../site_libs/quarto-html/tippy.css" rel="stylesheet">
+<link href="../../site_libs/quarto-html/quarto-syntax-highlighting-dark-8ef56b68f8fa1e9d2ba328e99e439f80.css" rel="stylesheet" id="quarto-text-highlighting-styles">
+<script src="../../site_libs/bootstrap/bootstrap.min.js"></script>
+<link href="../../site_libs/bootstrap/bootstrap-icons.css" rel="stylesheet">
+<link href="../../site_libs/bootstrap/bootstrap-ce762b396f898894284bb8eeee180359.min.css" rel="stylesheet" append-hash="true" id="quarto-bootstrap" data-mode="dark">
+<script id="quarto-search-options" type="application/json">{
+  "location": "navbar",
+  "copy-button": false,
+  "collapse-after": 3,
+  "panel-placement": "end",
+  "type": "overlay",
+  "limit": 50,
+  "keyboard-shortcut": [
+    "f",
+    "/",
+    "s"
+  ],
+  "show-item-context": false,
+  "language": {
+    "search-no-results-text": "No results",
+    "search-matching-documents-text": "matching documents",
+    "search-copy-link-title": "Copy link to search",
+    "search-hide-matches-text": "Hide additional matches",
+    "search-more-match-text": "more match in this document",
+    "search-more-matches-text": "more matches in this document",
+    "search-clear-button-title": "Clear",
+    "search-text-placeholder": "",
+    "search-detached-cancel-button-title": "Cancel",
+    "search-submit-button-title": "Submit",
+    "search-label": "Search"
+  }
+}</script>
+
+
+<link rel="stylesheet" href="../../styles.css">
+</head>
+
+<body class="nav-sidebar docked nav-fixed quarto-light">
+
+<div id="quarto-search-results"></div>
+  <header id="quarto-header" class="headroom fixed-top">
+    <nav class="navbar navbar-expand " data-bs-theme="dark">
+      <div class="navbar-container container-fluid">
+      <div class="navbar-brand-container mx-auto">
+    <a href="../../index.html" class="navbar-brand navbar-brand-logo">
+    <img src="../../image/axolotl_logo_digital_white.svg" alt="Axolotl" class="navbar-logo">
+    </a>
+  </div>
+        <div class="quarto-navbar-tools tools-wide tools-end">
+    <a href="https://twitter.com/axolotl_ai" title="" class="quarto-navigation-tool px-1" aria-label="Twitter"><i class="bi bi-twitter"></i></a>
+    <a href="https://github.com/axolotl-ai-cloud/axolotl/" title="" class="quarto-navigation-tool px-1" aria-label="GitHub"><i class="bi bi-github"></i></a>
+    <a href="https://discord.gg/7m9sfhzaf3" title="" class="quarto-navigation-tool px-1" aria-label="Discord"><i class="bi bi-discord"></i></a>
+</div>
+          <div id="quarto-search" class="" title="Search"></div>
+      </div> <!-- /container-fluid -->
+    </nav>
+  <nav class="quarto-secondary-nav">
+    <div class="container-fluid d-flex">
+      <button type="button" class="quarto-btn-toggle btn" data-bs-toggle="collapse" role="button" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">
+        <i class="bi bi-layout-text-sidebar-reverse"></i>
+      </button>
+        <nav class="quarto-page-breadcrumbs" aria-label="breadcrumb"><ol class="breadcrumb"></ol></nav>
+        <a class="flex-grow-1" role="navigation" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item" aria-controls="quarto-sidebar" aria-expanded="false" aria-label="Toggle sidebar navigation" onclick="if (window.quartoToggleHeadroom) { window.quartoToggleHeadroom(); }">      
+        </a>
+    </div>
+  </nav>
+</header>
+<!-- content -->
+<div id="quarto-content" class="quarto-container page-columns page-rows-contents page-layout-article page-navbar">
+<!-- sidebar -->
+  <nav id="quarto-sidebar" class="sidebar collapse collapse-horizontal quarto-sidebar-collapse-item sidebar-navigation docked overflow-auto">
+    <div class="sidebar-menu-container"> 
+    <ul class="list-unstyled mt-1">
+        <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../index.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Home</span></a>
+  </div>
+</li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true">
+ <span class="menu-text">Getting Started</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-1" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-1" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/getting-started.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Quickstart</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/installation.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Installation</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/inference.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Inference and Merging</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/cli.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Command Line Interface (CLI)</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/config.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Config Reference</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/api" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">API Reference</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a href="../../docs/dataset-formats/index.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Dataset Formats</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-2" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-2" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/pretraining.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Pre-training</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/inst_tune.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Instruction Tuning</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/conversation.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Conversation</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/stepwise_supervised.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Stepwise Supervised Format</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/template_free.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Template-Free</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset-formats/tokenized.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Custom Pre-Tokenized Dataset</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true">
+ <span class="menu-text">Deployments</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-3" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-3" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/docker.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Docker</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/multi-gpu.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Multi-GPU</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/multi-node.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Multi Node</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/ray-integration.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Ray Train</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/amd_hpc.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">AMD GPUs on HPC Systems</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/mac.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Mac M-series</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="true">
+ <span class="menu-text">How To Guides</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-4" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-4" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/multimodal.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">MultiModal / Vision Language Models (BETA)</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/rlhf.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">RLHF (Beta)</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/reward_modelling.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Reward Modelling</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/lr_groups.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Learning Rate Groups</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/lora_optims.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">LoRA Optimizations</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset_loading.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Dataset Loading</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true">
+ <span class="menu-text">Core Concepts</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-5" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-5" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/batch_vs_grad.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Batch size vs Gradient accumulation</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/dataset_preprocessing.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Dataset Preprocessing</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/multipack.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Multipack (Sample Packing)</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true">
+ <span class="menu-text">Advanced Features</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-6" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-6" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/fsdp_qlora.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">FSDP + QLoRA</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/unsloth.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Unsloth</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/torchao.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">PyTorch ao</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/custom_integrations.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Custom Integrations</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/sequence_parallelism.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Sequence Parallelism</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+        <li class="sidebar-item sidebar-item-section">
+      <div class="sidebar-item-container"> 
+            <a class="sidebar-item-text sidebar-link text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true">
+ <span class="menu-text">Troubleshooting</span></a>
+          <a class="sidebar-item-toggle text-start" data-bs-toggle="collapse" data-bs-target="#quarto-sidebar-section-7" role="navigation" aria-expanded="true" aria-label="Toggle section">
+            <i class="bi bi-chevron-right ms-2"></i>
+          </a> 
+      </div>
+      <ul id="quarto-sidebar-section-7" class="collapse list-unstyled sidebar-section depth1 show">  
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/faq.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">FAQ</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/debugging.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">Debugging</span></a>
+  </div>
+</li>
+          <li class="sidebar-item">
+  <div class="sidebar-item-container"> 
+  <a href="../../docs/nccl.html" class="sidebar-item-text sidebar-link">
+ <span class="menu-text">NCCL</span></a>
+  </div>
+</li>
+      </ul>
+  </li>
+    </ul>
+    </div>
+</nav>
+<div id="quarto-sidebar-glass" class="quarto-sidebar-collapse-item" data-bs-toggle="collapse" data-bs-target=".quarto-sidebar-collapse-item"></div>
+<!-- margin-sidebar -->
+    <div id="quarto-margin-sidebar" class="sidebar margin-sidebar">
+        <nav id="TOC" role="doc-toc" class="toc-active">
+    <h2 id="toc-title">On this page</h2>
+   
+  <ul>
+  <li><a href="#axolotl.utils.gradient_checkpointing.offload_disk" id="toc-axolotl.utils.gradient_checkpointing.offload_disk" class="nav-link active" data-scroll-target="#axolotl.utils.gradient_checkpointing.offload_disk">utils.gradient_checkpointing.offload_disk</a>
+  <ul class="collapse">
+  <li><a href="#classes" id="toc-classes" class="nav-link" data-scroll-target="#classes">Classes</a>
+  <ul class="collapse">
+  <li><a href="#axolotl.utils.gradient_checkpointing.offload_disk.Disco" id="toc-axolotl.utils.gradient_checkpointing.offload_disk.Disco" class="nav-link" data-scroll-target="#axolotl.utils.gradient_checkpointing.offload_disk.Disco">Disco</a></li>
+  <li><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager" id="toc-axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager" class="nav-link" data-scroll-target="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager">DiskOffloadManager</a></li>
+  </ul></li>
+  </ul></li>
+  </ul>
+</nav>
+    </div>
+<!-- main -->
+<main class="content" id="quarto-document-content"><header id="title-block-header" class="quarto-title-block"></header>
+
+
+
+
+<section id="axolotl.utils.gradient_checkpointing.offload_disk" class="level1">
+<h1>utils.gradient_checkpointing.offload_disk</h1>
+<p><code>utils.gradient_checkpointing.offload_disk</code></p>
+<p>DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching</p>
+<section id="classes" class="level2">
+<h2 class="anchored" data-anchor-id="classes">Classes</h2>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Name</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.Disco">Disco</a></td>
+<td>Disco: DIsk-based Storage and Checkpointing with Optimized prefetching</td>
+</tr>
+<tr class="even">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager">DiskOffloadManager</a></td>
+<td>Manages offloaded tensors and handles prefetching in a separate thread.</td>
+</tr>
+</tbody>
+</table>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.Disco" class="level3">
+<h3 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.Disco">Disco</h3>
+<div class="sourceCode" id="cb1"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.Disco()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Disco: DIsk-based Storage and Checkpointing with Optimized prefetching.
+Advanced disk-based gradient checkpointer with prefetching.</p>
+<section id="methods" class="level4">
+<h4 class="anchored" data-anchor-id="methods">Methods</h4>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Name</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.Disco.backward">backward</a></td>
+<td>Backward pass that loads activations from disk with prefetching</td>
+</tr>
+<tr class="even">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.Disco.forward">forward</a></td>
+<td>Forward pass that offloads activations to disk asynchronously</td>
+</tr>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.Disco.get_instance">get_instance</a></td>
+<td>Get or create the offload manager</td>
+</tr>
+</tbody>
+</table>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.backward" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.backward">backward</h5>
+<div class="sourceCode" id="cb2"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb2-1"><a href="#cb2-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.Disco.backward(ctx, <span class="op">*</span>grad_outputs)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Backward pass that loads activations from disk with prefetching</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.forward" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.forward">forward</h5>
+<div class="sourceCode" id="cb3"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb3-1"><a href="#cb3-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.Disco.forward(</span>
+<span id="cb3-2"><a href="#cb3-2" aria-hidden="true" tabindex="-1"></a>    ctx,</span>
+<span id="cb3-3"><a href="#cb3-3" aria-hidden="true" tabindex="-1"></a>    forward_function,</span>
+<span id="cb3-4"><a href="#cb3-4" aria-hidden="true" tabindex="-1"></a>    hidden_states,</span>
+<span id="cb3-5"><a href="#cb3-5" aria-hidden="true" tabindex="-1"></a>    <span class="op">*</span>args,</span>
+<span id="cb3-6"><a href="#cb3-6" aria-hidden="true" tabindex="-1"></a>    prefetch_size<span class="op">=</span><span class="dv">1</span>,</span>
+<span id="cb3-7"><a href="#cb3-7" aria-hidden="true" tabindex="-1"></a>    prefetch_to_gpu<span class="op">=</span><span class="va">True</span>,</span>
+<span id="cb3-8"><a href="#cb3-8" aria-hidden="true" tabindex="-1"></a>    save_workers<span class="op">=</span><span class="dv">4</span>,</span>
+<span id="cb3-9"><a href="#cb3-9" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Forward pass that offloads activations to disk asynchronously</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.get_instance" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.Disco.get_instance">get_instance</h5>
+<div class="sourceCode" id="cb4"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb4-1"><a href="#cb4-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.Disco.get_instance(</span>
+<span id="cb4-2"><a href="#cb4-2" aria-hidden="true" tabindex="-1"></a>    prefetch_size<span class="op">=</span><span class="dv">1</span>,</span>
+<span id="cb4-3"><a href="#cb4-3" aria-hidden="true" tabindex="-1"></a>    prefetch_to_gpu<span class="op">=</span><span class="va">True</span>,</span>
+<span id="cb4-4"><a href="#cb4-4" aria-hidden="true" tabindex="-1"></a>    save_workers<span class="op">=</span><span class="dv">4</span>,</span>
+<span id="cb4-5"><a href="#cb4-5" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Get or create the offload manager</p>
+</section>
+</section>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager" class="level3">
+<h3 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager">DiskOffloadManager</h3>
+<div class="sourceCode" id="cb5"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager(</span>
+<span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a>    <span class="va">self</span>,</span>
+<span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a>    prefetch_size<span class="op">=</span><span class="dv">3</span>,</span>
+<span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a>    prefetch_to_gpu<span class="op">=</span><span class="va">True</span>,</span>
+<span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a>    save_workers<span class="op">=</span><span class="dv">4</span>,</span>
+<span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Manages offloaded tensors and handles prefetching in a separate thread.
+Includes synchronization to prevent race conditions.</p>
+<section id="methods-1" class="level4">
+<h4 class="anchored" data-anchor-id="methods-1">Methods</h4>
+<table class="caption-top table">
+<thead>
+<tr class="header">
+<th>Name</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup">cleanup</a></td>
+<td>Clean up all temp files and stop prefetch thread with proper synchronization</td>
+</tr>
+<tr class="even">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor">cleanup_tensor</a></td>
+<td>Clean up a specific tensor file after it’s been used</td>
+</tr>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor">load_tensor</a></td>
+<td>Load tensor from disk or prefetch cache with proper synchronization</td>
+</tr>
+<tr class="even">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor">save_tensor</a></td>
+<td>Save tensor to disk asynchronously and return file path with thread-safe operations</td>
+</tr>
+<tr class="odd">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch">trigger_prefetch</a></td>
+<td>Trigger prefetching of the next N tensors with proper synchronization</td>
+</tr>
+<tr class="even">
+<td><a href="#axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save">wait_for_save</a></td>
+<td>Wait for a tensor to be saved to disk</td>
+</tr>
+</tbody>
+</table>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup">cleanup</h5>
+<div class="sourceCode" id="cb6"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb6-1"><a href="#cb6-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Clean up all temp files and stop prefetch thread with proper synchronization</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor">cleanup_tensor</h5>
+<div class="sourceCode" id="cb7"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb7-1"><a href="#cb7-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(</span>
+<span id="cb7-2"><a href="#cb7-2" aria-hidden="true" tabindex="-1"></a>    file_path,</span>
+<span id="cb7-3"><a href="#cb7-3" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Clean up a specific tensor file after it’s been used</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor">load_tensor</h5>
+<div class="sourceCode" id="cb8"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb8-1"><a href="#cb8-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(</span>
+<span id="cb8-2"><a href="#cb8-2" aria-hidden="true" tabindex="-1"></a>    file_path,</span>
+<span id="cb8-3"><a href="#cb8-3" aria-hidden="true" tabindex="-1"></a>    target_device<span class="op">=</span><span class="st">'cuda'</span>,</span>
+<span id="cb8-4"><a href="#cb8-4" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Load tensor from disk or prefetch cache with proper synchronization</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor">save_tensor</h5>
+<div class="sourceCode" id="cb9"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb9-1"><a href="#cb9-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(tensor)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Save tensor to disk asynchronously and return file path with thread-safe operations</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch">trigger_prefetch</h5>
+<div class="sourceCode" id="cb10"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb10-1"><a href="#cb10-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(</span>
+<span id="cb10-2"><a href="#cb10-2" aria-hidden="true" tabindex="-1"></a>    n<span class="op">=</span><span class="va">None</span>,</span>
+<span id="cb10-3"><a href="#cb10-3" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Trigger prefetching of the next N tensors with proper synchronization</p>
+</section>
+<section id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save" class="level5">
+<h5 class="anchored" data-anchor-id="axolotl.utils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save">wait_for_save</h5>
+<div class="sourceCode" id="cb11"><pre class="sourceCode python code-with-copy"><code class="sourceCode python"><span id="cb11-1"><a href="#cb11-1" aria-hidden="true" tabindex="-1"></a>utils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(</span>
+<span id="cb11-2"><a href="#cb11-2" aria-hidden="true" tabindex="-1"></a>    file_path,</span>
+<span id="cb11-3"><a href="#cb11-3" aria-hidden="true" tabindex="-1"></a>    timeout<span class="op">=</span><span class="va">None</span>,</span>
+<span id="cb11-4"><a href="#cb11-4" aria-hidden="true" tabindex="-1"></a>)</span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<p>Wait for a tensor to be saved to disk</p>
+
+
+</section>
+</section>
+</section>
+</section>
+</section>
+
+</main> <!-- /main -->
+<script id="quarto-html-after-body" type="application/javascript">
+  window.document.addEventListener("DOMContentLoaded", function (event) {
+    const icon = "";
+    const anchorJS = new window.AnchorJS();
+    anchorJS.options = {
+      placement: 'right',
+      icon: icon
+    };
+    anchorJS.add('.anchored');
+    const isCodeAnnotation = (el) => {
+      for (const clz of el.classList) {
+        if (clz.startsWith('code-annotation-')) {                     
+          return true;
+        }
+      }
+      return false;
+    }
+    const onCopySuccess = function(e) {
+      // button target
+      const button = e.trigger;
+      // don't keep focus
+      button.blur();
+      // flash "checked"
+      button.classList.add('code-copy-button-checked');
+      var currentTitle = button.getAttribute("title");
+      button.setAttribute("title", "Copied!");
+      let tooltip;
+      if (window.bootstrap) {
+        button.setAttribute("data-bs-toggle", "tooltip");
+        button.setAttribute("data-bs-placement", "left");
+        button.setAttribute("data-bs-title", "Copied!");
+        tooltip = new bootstrap.Tooltip(button, 
+          { trigger: "manual", 
+            customClass: "code-copy-button-tooltip",
+            offset: [0, -8]});
+        tooltip.show();    
+      }
+      setTimeout(function() {
+        if (tooltip) {
+          tooltip.hide();
+          button.removeAttribute("data-bs-title");
+          button.removeAttribute("data-bs-toggle");
+          button.removeAttribute("data-bs-placement");
+        }
+        button.setAttribute("title", currentTitle);
+        button.classList.remove('code-copy-button-checked');
+      }, 1000);
+      // clear code selection
+      e.clearSelection();
+    }
+    const getTextToCopy = function(trigger) {
+        const codeEl = trigger.previousElementSibling.cloneNode(true);
+        for (const childEl of codeEl.children) {
+          if (isCodeAnnotation(childEl)) {
+            childEl.remove();
+          }
+        }
+        return codeEl.innerText;
+    }
+    const clipboard = new window.ClipboardJS('.code-copy-button:not([data-in-quarto-modal])', {
+      text: getTextToCopy
+    });
+    clipboard.on('success', onCopySuccess);
+    if (window.document.getElementById('quarto-embedded-source-code-modal')) {
+      const clipboardModal = new window.ClipboardJS('.code-copy-button[data-in-quarto-modal]', {
+        text: getTextToCopy,
+        container: window.document.getElementById('quarto-embedded-source-code-modal')
+      });
+      clipboardModal.on('success', onCopySuccess);
+    }
+      var localhostRegex = new RegExp(/^(?:http|https):\/\/localhost\:?[0-9]*\//);
+      var mailtoRegex = new RegExp(/^mailto:/);
+        var filterRegex = new RegExp("https:\/\/docs\.axolotl\.ai");
+      var isInternal = (href) => {
+          return filterRegex.test(href) || localhostRegex.test(href) || mailtoRegex.test(href);
+      }
+      // Inspect non-navigation links and adorn them if external
+     var links = window.document.querySelectorAll('a[href]:not(.nav-link):not(.navbar-brand):not(.toc-action):not(.sidebar-link):not(.sidebar-item-toggle):not(.pagination-link):not(.no-external):not([aria-hidden]):not(.dropdown-item):not(.quarto-navigation-tool):not(.about-link)');
+      for (var i=0; i<links.length; i++) {
+        const link = links[i];
+        if (!isInternal(link.href)) {
+          // undo the damage that might have been done by quarto-nav.js in the case of
+          // links that we want to consider external
+          if (link.dataset.originalHref !== undefined) {
+            link.href = link.dataset.originalHref;
+          }
+        }
+      }
+    function tippyHover(el, contentFn, onTriggerFn, onUntriggerFn) {
+      const config = {
+        allowHTML: true,
+        maxWidth: 500,
+        delay: 100,
+        arrow: false,
+        appendTo: function(el) {
+            return el.parentElement;
+        },
+        interactive: true,
+        interactiveBorder: 10,
+        theme: 'quarto',
+        placement: 'bottom-start',
+      };
+      if (contentFn) {
+        config.content = contentFn;
+      }
+      if (onTriggerFn) {
+        config.onTrigger = onTriggerFn;
+      }
+      if (onUntriggerFn) {
+        config.onUntrigger = onUntriggerFn;
+      }
+      window.tippy(el, config); 
+    }
+    const noterefs = window.document.querySelectorAll('a[role="doc-noteref"]');
+    for (var i=0; i<noterefs.length; i++) {
+      const ref = noterefs[i];
+      tippyHover(ref, function() {
+        // use id or data attribute instead here
+        let href = ref.getAttribute('data-footnote-href') || ref.getAttribute('href');
+        try { href = new URL(href).hash; } catch {}
+        const id = href.replace(/^#\/?/, "");
+        const note = window.document.getElementById(id);
+        if (note) {
+          return note.innerHTML;
+        } else {
+          return "";
+        }
+      });
+    }
+    const xrefs = window.document.querySelectorAll('a.quarto-xref');
+    const processXRef = (id, note) => {
+      // Strip column container classes
+      const stripColumnClz = (el) => {
+        el.classList.remove("page-full", "page-columns");
+        if (el.children) {
+          for (const child of el.children) {
+            stripColumnClz(child);
+          }
+        }
+      }
+      stripColumnClz(note)
+      if (id === null || id.startsWith('sec-')) {
+        // Special case sections, only their first couple elements
+        const container = document.createElement("div");
+        if (note.children && note.children.length > 2) {
+          container.appendChild(note.children[0].cloneNode(true));
+          for (let i = 1; i < note.children.length; i++) {
+            const child = note.children[i];
+            if (child.tagName === "P" && child.innerText === "") {
+              continue;
+            } else {
+              container.appendChild(child.cloneNode(true));
+              break;
+            }
+          }
+          if (window.Quarto?.typesetMath) {
+            window.Quarto.typesetMath(container);
+          }
+          return container.innerHTML
+        } else {
+          if (window.Quarto?.typesetMath) {
+            window.Quarto.typesetMath(note);
+          }
+          return note.innerHTML;
+        }
+      } else {
+        // Remove any anchor links if they are present
+        const anchorLink = note.querySelector('a.anchorjs-link');
+        if (anchorLink) {
+          anchorLink.remove();
+        }
+        if (window.Quarto?.typesetMath) {
+          window.Quarto.typesetMath(note);
+        }
+        if (note.classList.contains("callout")) {
+          return note.outerHTML;
+        } else {
+          return note.innerHTML;
+        }
+      }
+    }
+    for (var i=0; i<xrefs.length; i++) { // wire a lazily-filled popup onto every cross-reference link
+      const xref = xrefs[i];
+      tippyHover(xref, undefined, function(instance) { // onTrigger hook: content is resolved when the popup is triggered
+        instance.disable(); // re-enabled and shown once the lookup below has finished
+        let url = xref.getAttribute('href');
+        let hash = undefined; 
+        if (url.startsWith('#')) {
+          hash = url;
+        } else {
+          try { hash = new URL(url).hash; } catch {} // an href that does not parse as a URL leaves hash undefined
+        }
+        if (hash) {
+          const id = hash.replace(/^#\/?/, "");
+          const note = window.document.getElementById(id);
+          if (note !== null) { // target element exists in this document
+            try {
+              const html = processXRef(id, note.cloneNode(true)); // processXRef mutates its argument, so hand it a clone
+              instance.setContent(html);
+            } finally {
+              instance.enable();
+              instance.show();
+            }
+          } else {
+            // Target is not in this document: fetch the linked page (href minus fragment) and look the id up there
+            fetch(url.split('#')[0]) // NOTE(review): no catch handler, so a failed fetch surfaces as an unhandled rejection
+            .then(res => res.text())
+            .then(html => {
+              const parser = new DOMParser();
+              const htmlDoc = parser.parseFromString(html, "text/html");
+              const note = htmlDoc.getElementById(id);
+              if (note !== null) {
+                const html = processXRef(id, note);
+                instance.setContent(html);
+              } 
+            }).finally(() => { // runs whether or not content was set
+              instance.enable();
+              instance.show();
+            });
+          }
+        } else {
+          // See if we can fetch a full url (with no hash to target)
+          // This is a special case and we should probably do some content thinning / targeting
+          fetch(url) // NOTE(review): no catch handler here either
+          .then(res => res.text())
+          .then(html => {
+            const parser = new DOMParser();
+            const htmlDoc = parser.parseFromString(html, "text/html");
+            const note = htmlDoc.querySelector('main.content');
+            if (note !== null) {
+              // This should only happen for chapter cross references
+              // (since there is no id in the URL)
+              // remove the first header
+              if (note.children.length > 0 && note.children[0].tagName === "HEADER") {
+                note.children[0].remove();
+              }
+              const html = processXRef(null, note); // null id selects processXRef's section branch
+              instance.setContent(html);
+            } 
+          }).finally(() => {
+            instance.enable();
+            instance.show();
+          });
+        }
+      }, function(instance) { // onUntrigger hook: no-op
+      });
+    }
+        let selectedAnnoteEl;
+        const selectorForAnnotation = ( cell, annotation) => {
+          let cellAttr = 'data-code-cell="' + cell + '"';
+          let lineAttr = 'data-code-annotation="' +  annotation + '"';
+          const selector = 'span[' + cellAttr + '][' + lineAttr + ']';
+          return selector;
+        }
+        const selectCodeLines = (annoteEl) => { // position the highlight overlays over the code lines an annotation entry targets
+          const doc = window.document; // NOTE(review): not used anywhere in this function
+          const targetCell = annoteEl.getAttribute("data-target-cell");
+          const targetAnnotation = annoteEl.getAttribute("data-target-annotation");
+          const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation));
+          const lines = annoteSpan.getAttribute("data-code-lines").split(","); // NOTE(review): throws if no span matches or the attribute is absent
+          const lineIds = lines.map((line) => {
+            return targetCell + "-" + line; // element id looked up below: "<cell>-<line>"
+          })
+          let top = null;
+          let height = null;
+          let parent = null;
+          if (lineIds.length > 0) {
+              // measure the first line; its top/height define the highlight unless a last line extends it below
+              const el = window.document.getElementById(lineIds[0]); // NOTE(review): not null-checked before use
+              top = el.offsetTop;
+              height = el.offsetHeight;
+              parent = el.parentElement.parentElement;
+            if (lineIds.length > 1) {
+              const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]);
+              const bottom = lastEl.offsetTop + lastEl.offsetHeight;
+              height = bottom - top; // span from the top of the first line to the bottom of the last
+            }
+            if (top !== null && height !== null && parent !== null) {
+              // reuse the existing highlight div, or create one inside `parent`, then position it
+              let div = window.document.getElementById("code-annotation-line-highlight");
+              if (div === null) {
+                div = window.document.createElement("div");
+                div.setAttribute("id", "code-annotation-line-highlight");
+                div.style.position = 'absolute';
+                parent.appendChild(div);
+              }
+              div.style.top = top - 2 + "px"; // overlay extends 2px above and below the measured lines
+              div.style.height = height + 4 + "px";
+              div.style.left = 0;
+              let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter");
+              if (gutterDiv === null) {
+                gutterDiv = window.document.createElement("div");
+                gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter");
+                gutterDiv.style.position = 'absolute';
+                const codeCell = window.document.getElementById(targetCell);
+                const gutter = codeCell.querySelector('.code-annotation-gutter'); // NOTE(review): assumes the cell contains a gutter element
+                gutter.appendChild(gutterDiv);
+              }
+              gutterDiv.style.top = top - 2 + "px"; // same vertical extent as the line highlight
+              gutterDiv.style.height = height + 4 + "px";
+            }
+            selectedAnnoteEl = annoteEl; // remembered so the resize and click handlers can act on it
+          }
+        };
+        const unselectCodeLines = () => {
+          const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"];
+          elementsIds.forEach((elId) => {
+            const div = window.document.getElementById(elId);
+            if (div) {
+              div.remove();
+            }
+          });
+          selectedAnnoteEl = undefined;
+        };
+          // Handle positioning of the toggle
+      window.addEventListener(
+        "resize",
+        throttle(() => {
+          elRect = undefined;
+          if (selectedAnnoteEl) {
+            selectCodeLines(selectedAnnoteEl);
+          }
+        }, 10)
+      );
+      function throttle(fn, ms) {
+      let throttle = false;
+      let timer;
+        return (...args) => {
+          if(!throttle) { // first call gets through
+              fn.apply(this, args);
+              throttle = true;
+          } else { // all the others get throttled
+              if(timer) clearTimeout(timer); // cancel #2
+              timer = setTimeout(() => {
+                fn.apply(this, args);
+                timer = throttle = false;
+              }, ms);
+          }
+        };
+      }
+        // Attach click handler to the DT
+        const annoteDls = window.document.querySelectorAll('dt[data-target-cell]');
+        for (const annoteDlNode of annoteDls) {
+          annoteDlNode.addEventListener('click', (event) => {
+            const clickedEl = event.target;
+            if (clickedEl !== selectedAnnoteEl) {
+              unselectCodeLines();
+              const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active');
+              if (activeEl) {
+                activeEl.classList.remove('code-annotation-active');
+              }
+              selectCodeLines(clickedEl);
+              clickedEl.classList.add('code-annotation-active');
+            } else {
+              // Unselect the line
+              unselectCodeLines();
+              clickedEl.classList.remove('code-annotation-active');
+            }
+          });
+        }
+    const findCites = (el) => {
+      const parentEl = el.parentElement;
+      if (parentEl) {
+        const cites = parentEl.dataset.cites;
+        if (cites) {
+          return {
+            el,
+            cites: cites.split(' ')
+          };
+        } else {
+          return findCites(el.parentElement)
+        }
+      } else {
+        return undefined;
+      }
+    };
+    var bibliorefs = window.document.querySelectorAll('a[role="doc-biblioref"]');
+    for (var i=0; i<bibliorefs.length; i++) {
+      const ref = bibliorefs[i];
+      const citeInfo = findCites(ref);
+      if (citeInfo) {
+        tippyHover(citeInfo.el, function() {
+          var popup = window.document.createElement('div');
+          citeInfo.cites.forEach(function(cite) {
+            var citeDiv = window.document.createElement('div');
+            citeDiv.classList.add('hanging-indent');
+            citeDiv.classList.add('csl-entry');
+            var biblioDiv = window.document.getElementById('ref-' + cite);
+            if (biblioDiv) {
+              citeDiv.innerHTML = biblioDiv.innerHTML;
+            }
+            popup.appendChild(citeDiv);
+          });
+          return popup.innerHTML;
+        });
+      }
+    }
+  });
+  </script>
+</div> <!-- /content -->
+
+
+
+
+</body></html>
\ No newline at end of file
diff --git a/docs/config.html b/docs/config.html
index 9468bd915..d37cb9396 100644
--- a/docs/config.html
+++ b/docs/config.html
@@ -1007,7 +1007,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 <span id="cb1-533"><a href="#cb1-533" aria-hidden="true" tabindex="-1"></a><span class="co"># Note that training loss may have an oscillating pattern with this enabled.</span></span>
 <span id="cb1-534"><a href="#cb1-534" aria-hidden="true" tabindex="-1"></a><span class="fu">group_by_length</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
 <span id="cb1-535"><a href="#cb1-535" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-536"><a href="#cb1-536" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use gradient checkpointing. Available options are: true, false, "offload".</span></span>
+<span id="cb1-536"><a href="#cb1-536" aria-hidden="true" tabindex="-1"></a><span class="co"># Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".</span></span>
 <span id="cb1-537"><a href="#cb1-537" aria-hidden="true" tabindex="-1"></a><span class="co"># https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing</span></span>
 <span id="cb1-538"><a href="#cb1-538" aria-hidden="true" tabindex="-1"></a><span class="fu">gradient_checkpointing</span><span class="kw">:</span><span class="at"> </span><span class="ch">false</span></span>
 <span id="cb1-539"><a href="#cb1-539" aria-hidden="true" tabindex="-1"></a><span class="co"># additional kwargs to pass to the trainer for gradient checkpointing</span></span>
diff --git a/search.json b/search.json
index cdb78aff6..5d9ef79e9 100644
--- a/search.json
+++ b/search.json
@@ -227,95 +227,123 @@
     "text": "Name\nDescription\n\n\n\n\nBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nDataCollatorForSeq2Seq\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\nPretrainingBatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\nV2BatchSamplerDataCollatorForSeq2Seq\nCollator for multipack specific to the using the BatchSampler\n\n\n\n\n\nutils.collators.batching.BatchSamplerDataCollatorForSeq2Seq(\n    self,\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.DataCollatorForSeq2Seq(\n    self,\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nData collator that will dynamically pad the inputs received, as well as the labels and position_ids\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer\n[PreTrainedTokenizer] or [PreTrainedTokenizerFast]\nThe tokenizer used for encoding the data.\nrequired\n\n\nmodel\n[PreTrainedModel]\nThe model that is being trained. If set and has the prepare_decoder_input_ids_from_labels, use it to prepare the decoder_input_ids This is useful when using label_smoothing to avoid calculating loss twice.\nNone\n\n\npadding\nbool, str or [~utils.PaddingStrategy], optional, defaults to True\nSelect a strategy to pad the returned sequences (according to the model’s padding side and padding index) among: - True or 'longest' (default): Pad to the longest sequence in the batch (or no padding if only a single sequence is provided). 
- 'max_length': Pad to a maximum length specified with the argument max_length or to the maximum acceptable input length for the model if that argument is not provided. - False or 'do_not_pad': No padding (i.e., can output a batch with sequences of different lengths).\nTrue\n\n\nmax_length\nint, optional\nMaximum length of the returned list and optionally padding length (see above).\nNone\n\n\npad_to_multiple_of\nint, optional\nIf set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability &gt;= 7.5 (Volta).\nNone\n\n\nlabel_pad_token_id\nint, optional, defaults to -100\nThe id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).\n-100\n\n\nreturn_tensors\nstr\nThe type of Tensor to return. Allowable values are “np”, “pt” and “tf”.\n'pt'\n\n\n\n\n\n\n\nutils.collators.batching.PretrainingBatchSamplerDataCollatorForSeq2Seq(\n    self,\n    *args,\n    multipack_attn=True,\n    **kwargs,\n)\nCollator for multipack specific to the using the BatchSampler\n\n\n\nutils.collators.batching.V2BatchSamplerDataCollatorForSeq2Seq(\n    self,\n    tokenizer,\n    model=None,\n    padding=True,\n    max_length=None,\n    pad_to_multiple_of=None,\n    label_pad_token_id=-100,\n    position_pad_token_id=0,\n    return_tensors='pt',\n)\nCollator for multipack specific to the using the BatchSampler"
   },
   {
-    "objectID": "docs/api/utils.callbacks.profiler.html",
-    "href": "docs/api/utils.callbacks.profiler.html",
-    "title": "utils.callbacks.profiler",
+    "objectID": "docs/api/utils.gradient_checkpointing.offload_disk.html",
+    "href": "docs/api/utils.gradient_checkpointing.offload_disk.html",
+    "title": "utils.gradient_checkpointing.offload_disk",
     "section": "",
-    "text": "utils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\n\n\n\nName\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps."
+    "text": "utils.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\nName\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.backward(ctx, *grad_outputs)\nBackward pass that loads activations from disk with prefetching\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    self,\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and 
return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the next N tensors with proper synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(tensor)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
   },
   {
-    "objectID": "docs/api/utils.callbacks.profiler.html#classes",
-    "href": "docs/api/utils.callbacks.profiler.html#classes",
-    "title": "utils.callbacks.profiler",
+    "objectID": "docs/api/utils.gradient_checkpointing.offload_disk.html#classes",
+    "href": "docs/api/utils.gradient_checkpointing.offload_disk.html#classes",
+    "title": "utils.gradient_checkpointing.offload_disk",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps."
+    "text": "Name\nDescription\n\n\n\n\nDisco\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\nDiskOffloadManager\nManages offloaded tensors and handles prefetching in a separate thread.\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco()\nDisco: DIsk-based Storage and Checkpointing with Optimized prefetching\nAdvanced disk-based gradient checkpointer with prefetching.\n\n\n\n\n\nName\nDescription\n\n\n\n\nbackward\nBackward pass that loads activations from disk with prefetching\n\n\nforward\nForward pass that offloads activations to disk asynchronously\n\n\nget_instance\nGet or create the offload manager\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.backward(ctx, *grad_outputs)\nBackward pass that loads activations from disk with prefetching\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.forward(\n    ctx,\n    forward_function,\n    hidden_states,\n    *args,\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nForward pass that offloads activations to disk asynchronously\n\n\n\nutils.gradient_checkpointing.offload_disk.Disco.get_instance(\n    prefetch_size=1,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nGet or create the offload manager\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager(\n    self,\n    prefetch_size=3,\n    prefetch_to_gpu=True,\n    save_workers=4,\n)\nManages offloaded tensors and handles prefetching in a separate thread.\nIncludes synchronization to prevent race conditions.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncleanup\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\ncleanup_tensor\nClean up a specific tensor file after it’s been used\n\n\nload_tensor\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\nsave_tensor\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\ntrigger_prefetch\nTrigger prefetching of the next N tensors with proper 
synchronization\n\n\nwait_for_save\nWait for a tensor to be saved to disk\n\n\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup()\nClean up all temp files and stop prefetch thread with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.cleanup_tensor(\n    file_path,\n)\nClean up a specific tensor file after it’s been used\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.load_tensor(\n    file_path,\n    target_device='cuda',\n)\nLoad tensor from disk or prefetch cache with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.save_tensor(tensor)\nSave tensor to disk asynchronously and return file path with thread-safe operations\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.trigger_prefetch(\n    n=None,\n)\nTrigger prefetching of the next N tensors with proper synchronization\n\n\n\nutils.gradient_checkpointing.offload_disk.DiskOffloadManager.wait_for_save(\n    file_path,\n    timeout=None,\n)\nWait for a tensor to be saved to disk"
   },
   {
-    "objectID": "docs/api/utils.callbacks.mlflow_.html",
-    "href": "docs/api/utils.callbacks.mlflow_.html",
-    "title": "utils.callbacks.mlflow_",
+    "objectID": "docs/api/utils.lora_embeddings.html",
+    "href": "docs/api/utils.lora_embeddings.html",
+    "title": "utils.lora_embeddings",
     "section": "",
-    "text": "utils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n    self,\n    axolotl_config_path,\n)\nCallback to save axolotl config to mlflow"
+    "text": "utils.lora_embeddings\nhelpers for lora embeddings\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch"
   },
   {
-    "objectID": "docs/api/utils.callbacks.mlflow_.html#classes",
-    "href": "docs/api/utils.callbacks.mlflow_.html#classes",
-    "title": "utils.callbacks.mlflow_",
+    "objectID": "docs/api/utils.lora_embeddings.html#functions",
+    "href": "docs/api/utils.lora_embeddings.html#functions",
+    "title": "utils.lora_embeddings",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n    self,\n    axolotl_config_path,\n)\nCallback to save axolotl config to mlflow"
+    "text": "Name\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch"
   },
   {
-    "objectID": "docs/api/utils.bench.html",
-    "href": "docs/api/utils.bench.html",
-    "title": "utils.bench",
+    "objectID": "docs/api/core.chat.messages.html",
+    "href": "docs/api/core.chat.messages.html",
+    "title": "core.chat.messages",
     "section": "",
-    "text": "utils.bench\nBenchmarking and measurement utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
+    "text": "core.chat.messages\ninternal message representations of chat messages\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and 
parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
   },
   {
-    "objectID": "docs/api/utils.bench.html#functions",
-    "href": "docs/api/utils.bench.html#functions",
-    "title": "utils.bench",
+    "objectID": "docs/api/core.chat.messages.html#classes",
+    "href": "docs/api/core.chat.messages.html#classes",
+    "title": "core.chat.messages",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
+    "text": "Name\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, 
and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
   },
   {
-    "objectID": "docs/api/utils.collators.mm_chat.html",
-    "href": "docs/api/utils.collators.mm_chat.html",
-    "title": "utils.collators.mm_chat",
+    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
+    "title": "monkeypatch.mistral_attn_hijack_flash",
     "section": "",
-    "text": "utils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    self,\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
+    "text": "monkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\n\n\n\nName\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
   },
   {
-    "objectID": "docs/api/utils.collators.mm_chat.html#classes",
-    "href": "docs/api/utils.collators.mm_chat.html#classes",
-    "title": "utils.collators.mm_chat",
+    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes",
+    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes",
+    "title": "monkeypatch.mistral_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    self,\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
+    "text": "Name\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html",
-    "href": "docs/api/prompt_strategies.alpaca_w_system.html",
-    "title": "prompt_strategies.alpaca_w_system",
+    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions",
+    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions",
+    "title": "monkeypatch.mistral_attn_hijack_flash",
     "section": "",
-    "text": "prompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\n\n\n\nName\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
+    "text": "Name\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
-    "href": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
-    "title": "prompt_strategies.alpaca_w_system",
+    "objectID": "docs/api/integrations.lm_eval.args.html",
+    "href": "docs/api/integrations.lm_eval.args.html",
+    "title": "integrations.lm_eval.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
+    "text": "integrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.llama3.html",
-    "href": "docs/api/prompt_strategies.kto.llama3.html",
-    "title": "prompt_strategies.kto.llama3",
+    "objectID": "docs/api/integrations.lm_eval.args.html#classes",
+    "href": "docs/api/integrations.lm_eval.args.html#classes",
+    "title": "integrations.lm_eval.args",
     "section": "",
-    "text": "prompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "Name\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.llama3.html#functions",
-    "href": "docs/api/prompt_strategies.kto.llama3.html#functions",
-    "title": "prompt_strategies.kto.llama3",
+    "objectID": "docs/api/prompt_strategies.chat_template.html",
+    "href": "docs/api/prompt_strategies.chat_template.html",
+    "title": "prompt_strategies.chat_template",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "prompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    self,\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    roles=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on 
configuration."
   },
   {
-    "objectID": "docs/api/kernels.utils.html",
-    "href": "docs/api/kernels.utils.html",
-    "title": "kernels.utils",
+    "objectID": "docs/api/prompt_strategies.chat_template.html#classes",
+    "href": "docs/api/prompt_strategies.chat_template.html#classes",
+    "title": "prompt_strategies.chat_template",
     "section": "",
-    "text": "kernels.utils\nkernels.utils\nUtilities for axolotl.kernels submodules."
+    "text": "Name\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    self,\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    roles=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
+  },
+  {
+    "objectID": "docs/api/integrations.kd.trainer.html",
+    "href": "docs/api/integrations.kd.trainer.html",
+    "title": "integrations.kd.trainer",
+    "section": "",
+    "text": "integrations.kd.trainer\nKD trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n    self,\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+  },
+  {
+    "objectID": "docs/api/integrations.kd.trainer.html#classes",
+    "href": "docs/api/integrations.kd.trainer.html#classes",
+    "title": "integrations.kd.trainer",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n    self,\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+  },
+  {
+    "objectID": "docs/api/utils.gradient_checkpointing.offload_cpu.html",
+    "href": "docs/api/utils.gradient_checkpointing.offload_cpu.html",
+    "title": "utils.gradient_checkpointing.offload_cpu",
+    "section": "",
+    "text": "utils.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\n\n\n\nName\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
+  },
+  {
+    "objectID": "docs/api/utils.gradient_checkpointing.offload_cpu.html#classes",
+    "href": "docs/api/utils.gradient_checkpointing.offload_cpu.html#classes",
+    "title": "utils.gradient_checkpointing.offload_cpu",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\nCPU_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.offload_cpu.CPU_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
   },
   {
     "objectID": "docs/api/monkeypatch.transformers_fa_utils.html",
@@ -350,7 +378,7 @@
     "href": "docs/api/index.html",
     "title": "API Reference",
     "section": "",
-    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing Dataset functionality\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl\n\n\ncore.trainer_builder\nBuilder for the training args and trainer\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.\n\n\n\n\n\n\nCommand-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils\nUtility methods for axolotl CLI.\n\n\ncli.vllm_serve\nCLI to start the vllm server for 
online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\n\nTraining implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL PPO trainer\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.relora\nModule for ReLoRA trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers\n\n\n\n\n\n\nMixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\ncore.trainers.mixins.sequence_parallel\nModule for Axolotl trainer sequence parallelism mixin\n\n\n\n\n\n\nContext managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\n\nPrompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion 
text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\n\nLow-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules.\n\n\n\n\n\n\nRuntime patches for model 
optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.llama_expand_mask\nexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.llama_patch_multipack\nPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\n\n\n\n\nUtility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get 
the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.unsloth\nUnsloth checkpointing\n\n\n\n\n\n\nPydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. 
configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\n\nThird-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\n\nCommon utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities.\n\n\n\n\n\n\nCustom model implementations\n\n\n\nmodels.mamba.modeling_mamba\n\n\n\n\n\n\n\nData processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\n\n\n\n\n\n\nTraining callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation 
metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks"
+    "text": "Core functionality for training\n\n\n\ntrain\nPrepare and train a model on a dataset. Can also infer from a model or merge lora\n\n\nevaluate\nModule for evaluating models.\n\n\ndatasets\nModule containing Dataset functionality\n\n\nconvert\nModule containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes\n\n\nprompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\nlogging_config\nCommon logging module for axolotl\n\n\ncore.trainer_builder\nBuilder for the training args and trainer\n\n\ncore.training_args\nextra axolotl specific training args\n\n\ncore.chat.messages\ninternal message representations of chat messages\n\n\ncore.chat.format.chatml\nChatML transformation functions for MessageContents\n\n\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents\n\n\ncore.chat.format.shared\nshared functions for format transforms\n\n\ncore.datasets.chat\nchat dataset module\n\n\ncore.datasets.transforms.chat_builder\nThis module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.\n\n\n\n\n\n\nCommand-line interface\n\n\n\ncli.main\nClick CLI definitions for various axolotl commands.\n\n\ncli.train\nCLI to run training on a model.\n\n\ncli.evaluate\nCLI to run evaluation on a model.\n\n\ncli.args\nModule for axolotl CLI command arguments.\n\n\ncli.checks\nVarious checks for Axolotl CLI.\n\n\ncli.config\nConfiguration loading and processing.\n\n\ncli.inference\nCLI to run inference on a trained model.\n\n\ncli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\ncli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\ncli.preprocess\nCLI to run preprocessing of a dataset.\n\n\ncli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\ncli.utils\nUtility methods for axolotl CLI.\n\n\ncli.vllm_serve\nCLI to start the vllm server for 
online RL\n\n\ncli.cloud.base\nbase class for cloud platforms from cli\n\n\ncli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\n\nTraining implementations\n\n\n\ncore.trainers.base\nModule for customized trainers\n\n\ncore.trainers.trl\nModule for TRL PPO trainer\n\n\ncore.trainers.mamba\nModule for mamba trainer\n\n\ncore.trainers.relora\nModule for ReLoRA trainer\n\n\ncore.trainers.dpo.trainer\nDPO trainer for axolotl\n\n\ncore.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\ncore.trainers.grpo.sampler\nRepeat random sampler (similar to the one implemented in\n\n\ncore.trainers.utils\nUtils for Axolotl trainers\n\n\n\n\n\n\nMixin classes for augmenting trainers\n\n\n\ncore.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\ncore.trainers.mixins.rng_state_loader\nTemporary fix/override for bug in resume from checkpoint\n\n\ncore.trainers.mixins.scheduler\nModule for Axolotl trainer scheduler mixin\n\n\ncore.trainers.mixins.sequence_parallel\nModule for Axolotl trainer sequence parallelism mixin\n\n\n\n\n\n\nContext managers for altering trainer behaviors\n\n\n\nutils.ctx_managers.sequence_parallel\nModule for Axolotl trainer sequence parallelism manager and utilities\n\n\n\n\n\n\nPrompt formatting strategies\n\n\n\nprompt_strategies.base\nmodule for base dataset transform strategies\n\n\nprompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\nprompt_strategies.alpaca_chat\nModule for Alpaca prompt strategy classes\n\n\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class\n\n\nprompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\nprompt_strategies.user_defined\nUser Defined prompts with configuration from the YML config\n\n\nprompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\n\n\nprompt_strategies.completion\nBasic completion 
text\n\n\nprompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\nprompt_strategies.stepwise_supervised\nModule for stepwise datasets, typically including a prompt and reasoning traces,\n\n\nprompt_strategies.metharme\nModule containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class\n\n\nprompt_strategies.orcamini\nPrompt Strategy for finetuning Orca Mini (v2) models\n\n\nprompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\nprompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\nprompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\nprompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr\n\n\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies\n\n\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy\n\n\nprompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\nprompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies\n\n\nprompt_strategies.orpo.chat_template\nchatml prompt tokenization strategy for ORPO\n\n\nprompt_strategies.bradley_terry.llama3\nchatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template\n\n\n\n\n\n\nLow-level performance optimizations\n\n\n\nkernels.lora\nModule for definition of Low-Rank Adaptation (LoRA) Triton kernels.\n\n\nkernels.geglu\nModule for definition of GEGLU Triton kernels.\n\n\nkernels.swiglu\nModule for definition of SwiGLU Triton kernels.\n\n\nkernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\nkernels.utils\nUtilities for axolotl.kernels submodules.\n\n\n\n\n\n\nRuntime patches for model 
optimizations\n\n\n\nmonkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments\n\n\nmonkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing\n\n\nmonkeypatch.relora\nImplements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune.\n\n\nmonkeypatch.llama_expand_mask\nexpands the binary attention mask per 3.2.2 of https://arxiv.org/pdf/2107.02027.pdf\n\n\nmonkeypatch.lora_kernels\nModule for patching custom LoRA Triton kernels and torch.autograd functions.\n\n\nmonkeypatch.utils\nShared utils for the monkeypatches\n\n\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model\n\n\nmonkeypatch.llama_patch_multipack\nPatched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention\n\n\nmonkeypatch.stablelm_attn_hijack_flash\nPyTorch StableLM Epoch model.\n\n\nmonkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\nmonkeypatch.transformers_fa_utils\nsee https://github.com/huggingface/transformers/pull/35834\n\n\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations\n\n\nmonkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes\n\n\nmonkeypatch.mixtral\nPatches to support multipack for mixtral\n\n\n\n\n\n\nUtility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get 
the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nutils.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching\n\n\n\n\n\n\nPydantic data models for Axolotl config\n\n\n\nutils.schemas.config\nModule with Pydantic models for configuration.\n\n\nutils.schemas.model\nPydantic models for model input / output, etc. 
configuration\n\n\nutils.schemas.training\nPydantic models for training hyperparameters\n\n\nutils.schemas.datasets\nPydantic models for datasets-related configuration\n\n\nutils.schemas.peft\nPydantic models for PEFT-related configuration\n\n\nutils.schemas.trl\nPydantic models for TRL trainer configuration\n\n\nutils.schemas.multimodal\nPydantic models for multimodal-related configuration\n\n\nutils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\nutils.schemas.enums\nEnums for Axolotl input config\n\n\nutils.schemas.utils\nUtilities for Axolotl Pydantic models\n\n\n\n\n\n\nThird-party integrations and extensions\n\n\n\nintegrations.base\nBase class for all plugins.\n\n\nintegrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\nintegrations.grokfast.optimizer\n\n\n\nintegrations.kd.trainer\nKD trainer\n\n\nintegrations.liger.args\nModule for handling LIGER input arguments.\n\n\nintegrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\nintegrations.spectrum.args\nModule for handling Spectrum input arguments.\n\n\n\n\n\n\nCommon utilities and shared functionality\n\n\n\ncommon.architectures\nCommon architecture specific constants\n\n\ncommon.const\nVarious shared constants\n\n\ncommon.datasets\nDataset loading utilities.\n\n\n\n\n\n\nCustom model implementations\n\n\n\nmodels.mamba.modeling_mamba\n\n\n\n\n\n\n\nData processing utilities\n\n\n\nutils.collators.core\nbasic shared collator constants\n\n\nutils.collators.batching\nData collators for axolotl to pad labels and position_ids for packed sequences\n\n\nutils.collators.mamba\ncollators for Mamba\n\n\nutils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\nutils.samplers.multipack\nMultipack Batch Sampler - An efficient batch sampler for packing variable-length sequences\n\n\n\n\n\n\nTraining callbacks\n\n\n\nutils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation 
metric.\n\n\nutils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\nutils.callbacks.lisa\nmodule for LISA\n\n\nutils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\nutils.callbacks.comet_\nComet module for trainer callbacks"
   },
   {
     "objectID": "docs/api/index.html#core",
@@ -413,7 +441,7 @@
     "href": "docs/api/index.html#utils",
     "title": "API Reference",
     "section": "",
-    "text": "Utility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.unsloth\nUnsloth checkpointing"
+    "text": "Utility functions\n\n\n\nutils.models\nModule for models and model loading\n\n\nutils.tokenization\nModule for tokenization utilities\n\n\nutils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\n\n\nutils.lora\nmodule to get the state dict of a merged lora model\n\n\nutils.lora_embeddings\nhelpers for lora embeddings\n\n\nutils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\nutils.bench\nBenchmarking and measurement utilities\n\n\nutils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\nutils.trainer\nModule containing the Trainer class and related functions\n\n\nutils.schedulers\nModule for custom LRScheduler class\n\n\nutils.distributed\nutility helpers for distributed checks\n\n\nutils.dict\nModule containing the DictDefault class\n\n\nutils.optimizers.adopt\nCopied from https://github.com/iShohei220/adopt\n\n\nutils.data.pretraining\ndata handling specific to pretraining\n\n\nutils.data.sft\ndata handling specific to SFT\n\n\nutils.gradient_checkpointing.offload_cpu\nCPU offloaded checkpointing\n\n\nutils.gradient_checkpointing.offload_disk\nDISCO - DIsk-based Storage and Checkpointing with Optimized prefetching"
   },
   {
     "objectID": "docs/api/index.html#schemas",
@@ -787,1078 +815,943 @@
     "text": "Name\nDescription\n\n\n\n\nMultiModalConfig\nMulti-modal configuration subset\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig()\nMulti-modal configuration subset\n\n\n\n\n\nName\nDescription\n\n\n\n\nconvert_image_resize_algorithm\nConvert the image resize algorithm to a PIL.Image.Resampling enum.\n\n\n\n\n\nutils.schemas.multimodal.MultiModalConfig.convert_image_resize_algorithm(\n    image_resize_algorithm,\n)\nConvert the image resize algorithm to a PIL.Image.Resampling enum."
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.user_defined.html",
-    "href": "docs/api/prompt_strategies.dpo.user_defined.html",
-    "title": "prompt_strategies.dpo.user_defined",
+    "objectID": "docs/api/prompt_strategies.input_output.html",
+    "href": "docs/api/prompt_strategies.input_output.html",
+    "title": "prompt_strategies.input_output",
     "section": "",
-    "text": "prompt_strategies.dpo.user_defined\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies"
+    "text": "prompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\n\n\n\nName\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    self,\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
   },
   {
-    "objectID": "docs/api/monkeypatch.multipack.html",
-    "href": "docs/api/monkeypatch.multipack.html",
-    "title": "monkeypatch.multipack",
+    "objectID": "docs/api/prompt_strategies.input_output.html#classes",
+    "href": "docs/api/prompt_strategies.input_output.html#classes",
+    "title": "prompt_strategies.input_output",
     "section": "",
-    "text": "monkeypatch.multipack\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing"
+    "text": "Name\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    self,\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.llama3.html",
-    "href": "docs/api/prompt_strategies.dpo.llama3.html",
-    "title": "prompt_strategies.dpo.llama3",
+    "objectID": "docs/api/cli.sweeps.html",
+    "href": "docs/api/cli.sweeps.html",
+    "title": "cli.sweeps",
     "section": "",
-    "text": "prompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "cli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.llama3.html#functions",
-    "href": "docs/api/prompt_strategies.dpo.llama3.html#functions",
-    "title": "prompt_strategies.dpo.llama3",
+    "objectID": "docs/api/cli.sweeps.html#functions",
+    "href": "docs/api/cli.sweeps.html#functions",
+    "title": "cli.sweeps",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+    "text": "Name\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}"
   },
   {
-    "objectID": "docs/api/utils.callbacks.perplexity.html",
-    "href": "docs/api/utils.callbacks.perplexity.html",
-    "title": "utils.callbacks.perplexity",
+    "objectID": "docs/api/core.trainers.grpo.trainer.html",
+    "href": "docs/api/core.trainers.grpo.trainer.html",
+    "title": "core.trainers.grpo.trainer",
     "section": "",
-    "text": "utils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
+    "text": "core.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    self,\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer()\nExtend the base GRPOTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/utils.callbacks.perplexity.html#classes",
-    "href": "docs/api/utils.callbacks.perplexity.html#classes",
-    "title": "utils.callbacks.perplexity",
+    "objectID": "docs/api/core.trainers.grpo.trainer.html#classes",
+    "href": "docs/api/core.trainers.grpo.trainer.html#classes",
+    "title": "core.trainers.grpo.trainer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
+    "text": "Name\nDescription\n\n\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    self,\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer()\nExtend the base GRPOTrainer for axolotl helpers"
   },
   {
-    "objectID": "docs/api/logging_config.html",
-    "href": "docs/api/logging_config.html",
-    "title": "logging_config",
+    "objectID": "docs/api/cli.cloud.modal_.html",
+    "href": "docs/api/cli.cloud.modal_.html",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "logging_config\nCommon logging module for axolotl\n\n\n\n\n\nName\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
+    "text": "cli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\nName\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
   },
   {
-    "objectID": "docs/api/logging_config.html#classes",
-    "href": "docs/api/logging_config.html#classes",
-    "title": "logging_config",
+    "objectID": "docs/api/cli.cloud.modal_.html#classes",
+    "href": "docs/api/cli.cloud.modal_.html#classes",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type"
+    "text": "Name\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation."
   },
   {
-    "objectID": "docs/api/logging_config.html#functions",
-    "href": "docs/api/logging_config.html#functions",
-    "title": "logging_config",
+    "objectID": "docs/api/cli.cloud.modal_.html#functions",
+    "href": "docs/api/cli.cloud.modal_.html#functions",
+    "title": "cli.cloud.modal_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
+    "text": "Name\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
   },
   {
-    "objectID": "docs/api/integrations.cut_cross_entropy.args.html",
-    "href": "docs/api/integrations.cut_cross_entropy.args.html",
-    "title": "integrations.cut_cross_entropy.args",
+    "objectID": "docs/api/integrations.base.html",
+    "href": "docs/api/integrations.base.html",
+    "title": "integrations.base",
     "section": "",
-    "text": "integrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
+    "text": "integrations.base\nBase class for all plugins.\nA plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.\nPlugins can be used to integrate third-party models, modify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\nload_datasets(cfg): Loads and preprocesses the dataset for training.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.\npost_trainer_create(cfg, trainer): Performs actions after the trainer is created.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after 
training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the 
plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\noptimizer\nobject\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\nLRScheduler\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the trainer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the 
datasets.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe axolotl configuration\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for 
training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input 
arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model has been loaded\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\nParameters:\ntrainer 
(object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\nParameters:\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nThe configuration for the plugins.\nrequired\n\n\npreprocess\n\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins 
after the model has been built/loaded,\nbut before any adapters have been applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugins.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model has been loaded\ninclusive of any adapters\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the post_trainer_create method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be 
imported.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/integrations.cut_cross_entropy.args.html#classes",
-    "href": "docs/api/integrations.cut_cross_entropy.args.html#classes",
-    "title": "integrations.cut_cross_entropy.args",
+    "objectID": "docs/api/integrations.base.html#classes",
+    "href": "docs/api/integrations.base.html#classes",
+    "title": "integrations.base",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
+    "text": "Name\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\nload_datasets(cfg): Loads and preprocesses the dataset for training.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.\npost_trainer_create(cfg, trainer): Performs actions after the trainer is created.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input 
arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate 
scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\noptimizer\nobject\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\nLRScheduler\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the trainer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded 
model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe axolotl configuration\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded 
model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered 
plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model has been loaded\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\nParameters:\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None 
optimizer.\nParameters:\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nThe configuration for the plugins.\nrequired\n\n\npreprocess\n\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\nbut before any adapters have been applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugins.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the 
post_model_load method of all registered plugins after the model has been loaded\ninclusive of any adapters\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the post_trainer_create method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/utils.schemas.integrations.html",
-    "href": "docs/api/utils.schemas.integrations.html",
-    "title": "utils.schemas.integrations",
+    "objectID": "docs/api/integrations.base.html#functions",
+    "href": "docs/api/integrations.base.html#functions",
+    "title": "integrations.base",
     "section": "",
-    "text": "utils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\n\n\n\nName\nDescription\n\n\n\n\nCometConfig\nComet configuration subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset"
+    "text": "Name\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported."
   },
   {
-    "objectID": "docs/api/utils.schemas.integrations.html#classes",
-    "href": "docs/api/utils.schemas.integrations.html#classes",
-    "title": "utils.schemas.integrations",
+    "objectID": "docs/api/kernels.swiglu.html",
+    "href": "docs/api/kernels.swiglu.html",
+    "title": "kernels.swiglu",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCometConfig\nComet configuration subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset"
+    "text": "kernels.swiglu\nModule for definition of SwiGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
-    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
-    "title": "monkeypatch.data.batch_dataset_fetcher",
+    "objectID": "docs/api/kernels.swiglu.html#functions",
+    "href": "docs/api/kernels.swiglu.html#functions",
+    "title": "kernels.swiglu",
     "section": "",
-    "text": "monkeypatch.data.batch_dataset_fetcher\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes"
+    "text": "Name\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/utils.freeze.html",
+    "href": "docs/api/utils.freeze.html",
+    "title": "utils.freeze",
     "section": "",
-    "text": "cli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    safe_serialization=False,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. 
Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
+    "text": "utils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\n\n\n\nName\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place."
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/utils.freeze.html#classes",
+    "href": "docs/api/utils.freeze.html#classes",
+    "title": "utils.freeze",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading."
+    "text": "Name\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise."
   },
   {
-    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
-    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
-    "title": "cli.merge_sharded_fsdp_weights",
+    "objectID": "docs/api/utils.freeze.html#functions",
+    "href": "docs/api/utils.freeze.html#functions",
+    "title": "utils.freeze",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    safe_serialization=False,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
+    "text": "Name\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place."
   },
   {
-    "objectID": "docs/api/cli.preprocess.html",
-    "href": "docs/api/cli.preprocess.html",
-    "title": "cli.preprocess",
+    "objectID": "docs/api/utils.schedulers.html",
+    "href": "docs/api/utils.schedulers.html",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "cli.preprocess\nCLI to run preprocessing of a dataset.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired"
+    "text": "utils.schedulers\nModule for custom LRScheduler class\n\n\n\n\n\nName\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    self,\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n    self,\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
   },
   {
-    "objectID": "docs/api/cli.preprocess.html#functions",
-    "href": "docs/api/cli.preprocess.html#functions",
-    "title": "cli.preprocess",
+    "objectID": "docs/api/utils.schedulers.html#classes",
+    "href": "docs/api/utils.schedulers.html#classes",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    self,\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n    self,\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0"
   },
   {
-    "objectID": "docs/api/prompt_tokenizers.html",
-    "href": "docs/api/prompt_tokenizers.html",
-    "title": "prompt_tokenizers",
+    "objectID": "docs/api/utils.schedulers.html#functions",
+    "href": "docs/api/utils.schedulers.html#functions",
+    "title": "utils.schedulers",
     "section": "",
-    "text": "prompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat 
Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt 
function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n    result,\n    current_len,\n    res,\n    labels,\n    pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function"
+    "text": "Name\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    
constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
   },
   {
-    "objectID": "docs/api/prompt_tokenizers.html#classes",
-    "href": "docs/api/prompt_tokenizers.html#classes",
-    "title": "prompt_tokenizers",
+    "objectID": "docs/api/prompt_strategies.kto.user_defined.html",
+    "href": "docs/api/prompt_strategies.kto.user_defined.html",
+    "title": "prompt_strategies.kto.user_defined",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    
train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts."
+    "text": "prompt_strategies.kto.user_defined\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies"
   },
   {
-    "objectID": "docs/api/prompt_tokenizers.html#functions",
-    "href": "docs/api/prompt_tokenizers.html#functions",
-    "title": "prompt_tokenizers",
+    "objectID": "docs/api/utils.data.pretraining.html",
+    "href": "docs/api/utils.data.pretraining.html",
+    "title": "utils.data.pretraining",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n    result,\n    current_len,\n    res,\n    labels,\n    pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function"
+    "text": "utils.data.pretraining\nutils.data.pretraining\ndata handling specific to pretraining"
   },
   {
-    "objectID": "docs/api/utils.collators.core.html",
-    "href": "docs/api/utils.collators.core.html",
-    "title": "utils.collators.core",
+    "objectID": "docs/api/core.trainers.trl.html",
+    "href": "docs/api/core.trainers.trl.html",
+    "title": "core.trainers.trl",
     "section": "",
-    "text": "utils.collators.core\nutils.collators.core\nbasic shared collator constants"
+    "text": "core.trainers.trl\nModule for TRL PPO trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations"
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
-    "title": "monkeypatch.llama_attn_hijack_xformers",
+    "objectID": "docs/api/core.trainers.trl.html#classes",
+    "href": "docs/api/core.trainers.trl.html#classes",
+    "title": "core.trainers.trl",
     "section": "",
-    "text": "monkeypatch.llama_attn_hijack_xformers\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations"
   },
   {
-    "objectID": "docs/api/cli.merge_lora.html",
-    "href": "docs/api/cli.merge_lora.html",
-    "title": "cli.merge_lora",
+    "objectID": "docs/api/cli.config.html",
+    "href": "docs/api/cli.config.html",
+    "title": "cli.config",
     "section": "",
-    "text": "cli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "cli.config\nConfiguration loading and processing.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/cli.merge_lora.html#functions",
-    "href": "docs/api/cli.merge_lora.html#functions",
-    "title": "cli.merge_lora",
+    "objectID": "docs/api/cli.config.html#functions",
+    "href": "docs/api/cli.config.html#functions",
+    "title": "cli.config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/core.chat.format.chatml.html",
-    "href": "docs/api/core.chat.format.chatml.html",
-    "title": "core.chat.format.chatml",
+    "objectID": "docs/api/utils.dict.html",
+    "href": "docs/api/utils.dict.html",
+    "title": "utils.dict",
     "section": "",
-    "text": "core.chat.format.chatml\ncore.chat.format.chatml\nChatML transformation functions for MessageContents"
+    "text": "utils.dict\nModule containing the DictDefault class\n\n\n\n\n\nName\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
   },
   {
-    "objectID": "docs/api/core.trainer_builder.html",
-    "href": "docs/api/core.trainer_builder.html",
-    "title": "core.trainer_builder",
+    "objectID": "docs/api/utils.dict.html#classes",
+    "href": "docs/api/utils.dict.html#classes",
+    "title": "utils.dict",
     "section": "",
-    "text": "core.trainer_builder\nBuilder for the training args and trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n    trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
+    "text": "Name\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
   },
   {
-    "objectID": "docs/api/core.trainer_builder.html#classes",
-    "href": "docs/api/core.trainer_builder.html#classes",
-    "title": "core.trainer_builder",
+    "objectID": "docs/api/prompt_strategies.messages.chat.html",
+    "href": "docs/api/prompt_strategies.messages.chat.html",
+    "title": "prompt_strategies.messages.chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n    trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
+    "text": "prompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n    self,\n    processor,\n    message_transform=None,\n    formatter=None,\n    **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.chatml.html",
-    "href": "docs/api/prompt_strategies.kto.chatml.html",
-    "title": "prompt_strategies.kto.chatml",
+    "objectID": "docs/api/prompt_strategies.messages.chat.html#classes",
+    "href": "docs/api/prompt_strategies.messages.chat.html#classes",
+    "title": "prompt_strategies.messages.chat",
     "section": "",
-    "text": "prompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "Name\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n    self,\n    processor,\n    message_transform=None,\n    formatter=None,\n    **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.chatml.html#functions",
-    "href": "docs/api/prompt_strategies.kto.chatml.html#functions",
-    "title": "prompt_strategies.kto.chatml",
+    "objectID": "docs/api/prompt_strategies.llama2_chat.html",
+    "href": "docs/api/prompt_strategies.llama2_chat.html",
+    "title": "prompt_strategies.llama2_chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
+    "text": "prompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\nsee also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.\nThis implementation is based on the Vicuna PR and the fastchat repo, see also:\nhttps://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847\nUse dataset type: “llama2_chat” in conig.yml to use this prompt style.\nE.g. in the config.yml:\ndatasets:\n  - path: llama_finetune_train.jsonl\n    type: llama2_chat\nThe dataset itself should look like this:\n{'conversations':[{\"from\": \"human\", \"value\": \"Who are you?\"}, {\"from\": \"gpt\", \"value\": \"I am Vicuna\"},...]}\nin a jsonl file. The first message should be from the human, the second from gpt.\nFor a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).\nImportant: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!\n\n\n\n\n\nName\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n    self,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    self,\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
   },
   {
-    "objectID": "docs/api/prompt_strategies.completion.html",
-    "href": "docs/api/prompt_strategies.completion.html",
-    "title": "prompt_strategies.completion",
+    "objectID": "docs/api/prompt_strategies.llama2_chat.html#classes",
+    "href": "docs/api/prompt_strategies.llama2_chat.html#classes",
+    "title": "prompt_strategies.llama2_chat",
     "section": "",
-    "text": "prompt_strategies.completion\nBasic completion text\n\n\n\n\n\nName\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n    self,\n    *args,\n    max_length=None,\n    **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion"
+    "text": "Name\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n    self,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    self,\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
   },
   {
-    "objectID": "docs/api/prompt_strategies.completion.html#classes",
-    "href": "docs/api/prompt_strategies.completion.html#classes",
-    "title": "prompt_strategies.completion",
+    "objectID": "docs/api/core.datasets.chat.html",
+    "href": "docs/api/core.datasets.chat.html",
+    "title": "core.datasets.chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n    self,\n    *args,\n    max_length=None,\n    **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion"
+    "text": "core.datasets.chat\nchat dataset module\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    self,\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
   },
   {
-    "objectID": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
-    "title": "monkeypatch.btlm_attn_hijack_flash",
+    "objectID": "docs/api/core.datasets.chat.html#classes",
+    "href": "docs/api/core.datasets.chat.html#classes",
+    "title": "core.datasets.chat",
     "section": "",
-    "text": "monkeypatch.btlm_attn_hijack_flash\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model"
+    "text": "Name\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    self,\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
   },
   {
-    "objectID": "docs/api/utils.tokenization.html",
-    "href": "docs/api/utils.tokenization.html",
-    "title": "utils.tokenization",
+    "objectID": "docs/api/utils.models.html",
+    "href": "docs/api/utils.models.html",
+    "title": "utils.models",
     "section": "",
-    "text": "utils.tokenization\nModule for tokenization utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
+    "text": "utils.models\nModule for models and model loading\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n    self,\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). 
When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): 
new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
   },
   {
-    "objectID": "docs/api/utils.tokenization.html#functions",
-    "href": "docs/api/utils.tokenization.html#functions",
-    "title": "utils.tokenization",
+    "objectID": "docs/api/utils.models.html#classes",
+    "href": "docs/api/utils.models.html#classes",
+    "title": "utils.models",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
+    "text": "Name\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n    self,\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model."
   },
   {
-    "objectID": "docs/api/cli.utils.html",
-    "href": "docs/api/cli.utils.html",
-    "title": "cli.utils",
+    "objectID": "docs/api/utils.models.html#functions",
+    "href": "docs/api/utils.models.html#functions",
+    "title": "utils.models",
     "section": "",
-    "text": "cli.utils\nUtility methods for axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, 
raw_base_url, dest_path, dir_prefix)\nDownload a single file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, 
ProcessorMixin).\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged."
+    "text": "Name\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified 
tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
   },
   {
-    "objectID": "docs/api/cli.utils.html#functions",
-    "href": "docs/api/cli.utils.html#functions",
-    "title": "cli.utils",
+    "objectID": "docs/api/core.trainers.mixins.optimizer.html",
+    "href": "docs/api/core.trainers.mixins.optimizer.html",
+    "title": "core.trainers.mixins.optimizer",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, raw_base_url, dest_path, dir_prefix)\nDownload a single 
file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin).\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type 
from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged."
+    "text": "core.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nOptimizerMixin\nMixin class for shared handling of building custom optimizers\n\n\n\n\n\ncore.trainers.mixins.optimizer.OptimizerMixin()\nMixin class for shared handling of building custom optimizers"
   },
   {
-    "objectID": "docs/api/cli.main.html",
-    "href": "docs/api/cli.main.html",
-    "title": "cli.main",
+    "objectID": "docs/api/core.trainers.mixins.optimizer.html#classes",
+    "href": "docs/api/core.trainers.mixins.optimizer.html#classes",
+    "title": "core.trainers.mixins.optimizer",
     "section": "",
-    "text": "cli.main\nClick CLI definitions for various axolotl commands.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base 
model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
+    "text": "Name\nDescription\n\n\n\n\nOptimizerMixin\nMixin class for shared handling of building custom optimizers\n\n\n\n\n\ncore.trainers.mixins.optimizer.OptimizerMixin()\nMixin class for shared handling of building custom optimizers"
   },
   {
-    "objectID": "docs/api/cli.main.html#functions",
-    "href": "docs/api/cli.main.html#functions",
-    "title": "cli.main",
+    "objectID": "docs/api/integrations.liger.args.html",
+    "href": "docs/api/integrations.liger.args.html",
+    "title": "integrations.liger.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base 
model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
+    "text": "integrations.liger.args\nModule for handling LIGER input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLigerArgs\nInput args for LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER."
   },
   {
-    "objectID": "docs/api/monkeypatch.attention.mllama.html",
-    "href": "docs/api/monkeypatch.attention.mllama.html",
-    "title": "monkeypatch.attention.mllama",
+    "objectID": "docs/api/integrations.liger.args.html#classes",
+    "href": "docs/api/integrations.liger.args.html#classes",
+    "title": "integrations.liger.args",
     "section": "",
-    "text": "monkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\n\n\n\nName\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n    self,\n    *args,\n    **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n    self,\n    config,\n    layer_idx,\n    *args,\n    **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance."
+    "text": "Name\nDescription\n\n\n\n\nLigerArgs\nInput args for LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER."
   },
   {
-    "objectID": "docs/api/monkeypatch.attention.mllama.html#classes",
-    "href": "docs/api/monkeypatch.attention.mllama.html#classes",
-    "title": "monkeypatch.attention.mllama",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
+    "title": "monkeypatch.llama_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n    self,\n    *args,\n    **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n    self,\n    config,\n    layer_idx,\n    *args,\n    **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance."
+    "text": "monkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\n\n\n\nName\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
   },
   {
-    "objectID": "docs/api/cli.checks.html",
-    "href": "docs/api/cli.checks.html",
-    "title": "cli.checks",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#classes",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#classes",
+    "title": "monkeypatch.llama_attn_hijack_flash",
     "section": "",
-    "text": "cli.checks\nVarious checks for Axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
+    "text": "Name\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone"
   },
   {
-    "objectID": "docs/api/cli.checks.html#functions",
-    "href": "docs/api/cli.checks.html#functions",
-    "title": "cli.checks",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
+    "title": "monkeypatch.llama_attn_hijack_flash",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
+    "text": "Name\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
   },
   {
-    "objectID": "docs/api/prompt_strategies.pygmalion.html",
-    "href": "docs/api/prompt_strategies.pygmalion.html",
-    "title": "prompt_strategies.pygmalion",
+    "objectID": "docs/api/integrations.grokfast.optimizer.html",
+    "href": "docs/api/integrations.grokfast.optimizer.html",
+    "title": "integrations.grokfast.optimizer",
     "section": "",
-    "text": "prompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion."
+    "text": "integrations.grokfast.optimizer\nintegrations.grokfast.optimizer"
   },
   {
-    "objectID": "docs/api/prompt_strategies.pygmalion.html#classes",
-    "href": "docs/api/prompt_strategies.pygmalion.html#classes",
-    "title": "prompt_strategies.pygmalion",
+    "objectID": "docs/api/utils.chat_templates.html",
+    "href": "docs/api/utils.chat_templates.html",
+    "title": "utils.chat_templates",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion."
+    "text": "utils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\nThese templates are used for formatting messages in a conversation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n    user_choice,\n    jinja_template=None,\n    tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired"
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.sequence_parallel.html",
-    "href": "docs/api/core.trainers.mixins.sequence_parallel.html",
-    "title": "core.trainers.mixins.sequence_parallel",
+    "objectID": "docs/api/utils.chat_templates.html#functions",
+    "href": "docs/api/utils.chat_templates.html#functions",
+    "title": "utils.chat_templates",
     "section": "",
-    "text": "core.trainers.mixins.sequence_parallel\nModule for Axolotl trainer sequence parallelism mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nSequenceParallelMixin\nMixin class for sequence parallelism support in trainers.\n\n\n\n\n\ncore.trainers.mixins.sequence_parallel.SequenceParallelMixin()\nMixin class for sequence parallelism support in trainers.\nThis mixin provides functionality for handling sequence parallelism,\nspecifically for creating appropriate data samplers."
+    "text": "Name\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n    user_choice,\n    jinja_template=None,\n    tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired"
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.sequence_parallel.html#classes",
-    "href": "docs/api/core.trainers.mixins.sequence_parallel.html#classes",
-    "title": "core.trainers.mixins.sequence_parallel",
+    "objectID": "docs/api/utils.model_shard_quant.html",
+    "href": "docs/api/utils.model_shard_quant.html",
+    "title": "utils.model_shard_quant",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nSequenceParallelMixin\nMixin class for sequence parallelism support in trainers.\n\n\n\n\n\ncore.trainers.mixins.sequence_parallel.SequenceParallelMixin()\nMixin class for sequence parallelism support in trainers.\nThis mixin provides functionality for handling sequence parallelism,\nspecifically for creating appropriate data samplers."
+    "text": "utils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.zephyr.html",
-    "href": "docs/api/prompt_strategies.dpo.zephyr.html",
-    "title": "prompt_strategies.dpo.zephyr",
+    "objectID": "docs/api/utils.model_shard_quant.html#functions",
+    "href": "docs/api/utils.model_shard_quant.html#functions",
+    "title": "utils.model_shard_quant",
     "section": "",
-    "text": "prompt_strategies.dpo.zephyr\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr"
+    "text": "Name\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
   },
   {
-    "objectID": "docs/api/kernels.quantize.html",
-    "href": "docs/api/kernels.quantize.html",
-    "title": "kernels.quantize",
+    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html",
+    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html",
+    "title": "monkeypatch.trainer_fsdp_optim",
     "section": "",
-    "text": "kernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (&gt;0.43.3)."
+    "text": "monkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
   },
   {
-    "objectID": "docs/api/kernels.quantize.html#functions",
-    "href": "docs/api/kernels.quantize.html#functions",
-    "title": "kernels.quantize",
+    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
+    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
+    "title": "monkeypatch.trainer_fsdp_optim",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (&gt;0.43.3)."
+    "text": "Name\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
   },
   {
-    "objectID": "docs/api/cli.cloud.base.html",
-    "href": "docs/api/cli.cloud.base.html",
-    "title": "cli.cloud.base",
+    "objectID": "docs/api/monkeypatch.unsloth_.html",
+    "href": "docs/api/monkeypatch.unsloth_.html",
+    "title": "monkeypatch.unsloth_",
     "section": "",
-    "text": "cli.cloud.base\nbase class for cloud platforms from cli\n\n\n\n\n\nName\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
+    "text": "monkeypatch.unsloth_\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations"
   },
   {
-    "objectID": "docs/api/cli.cloud.base.html#classes",
-    "href": "docs/api/cli.cloud.base.html#classes",
-    "title": "cli.cloud.base",
+    "objectID": "docs/api/prompt_strategies.dpo.passthrough.html",
+    "href": "docs/api/prompt_strategies.dpo.passthrough.html",
+    "title": "prompt_strategies.dpo.passthrough",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
+    "text": "prompt_strategies.dpo.passthrough\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy"
   },
   {
-    "objectID": "docs/api/prompt_strategies.alpaca_instruct.html",
-    "href": "docs/api/prompt_strategies.alpaca_instruct.html",
-    "title": "prompt_strategies.alpaca_instruct",
+    "objectID": "docs/api/core.training_args.html",
+    "href": "docs/api/core.training_args.html",
+    "title": "core.training_args",
     "section": "",
-    "text": "prompt_strategies.alpaca_instruct\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class"
+    "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n    
simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    
relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    
lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    
image_size=None,\n    image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    
eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args."
   },
   {
-    "objectID": "docs/api/core.chat.format.llama3x.html",
-    "href": "docs/api/core.chat.format.llama3x.html",
-    "title": "core.chat.format.llama3x",
+    "objectID": "docs/api/core.training_args.html#classes",
+    "href": "docs/api/core.training_args.html#classes",
+    "title": "core.training_args",
     "section": "",
-    "text": "core.chat.format.llama3x\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents"
+    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n    simpo_gamma=None,\n)\nCPO config for CPO 
training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    
relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    
orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nReward config for 
Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    
sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args."
   },
   {
-    "objectID": "docs/api/core.trainers.utils.html",
-    "href": "docs/api/core.trainers.utils.html",
-    "title": "core.trainers.utils",
+    "objectID": "docs/api/cli.vllm_serve.html",
+    "href": "docs/api/cli.vllm_serve.html",
+    "title": "cli.vllm_serve",
     "section": "",
-    "text": "core.trainers.utils\ncore.trainers.utils\nUtils for Axolotl trainers"
+    "text": "cli.vllm_serve\nCLI to start the vllm server for online RL\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
   },
   {
-    "objectID": "docs/faq.html",
-    "href": "docs/faq.html",
-    "title": "FAQ",
+    "objectID": "docs/api/cli.vllm_serve.html#functions",
+    "href": "docs/api/cli.vllm_serve.html#functions",
+    "title": "cli.vllm_serve",
     "section": "",
-    "text": "General\nQ: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.\n\nQ: The codes is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.\n\nQ: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.\n\nA: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom python scripts?\n\nA: Since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.\n\nQ: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?\n\nA: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. 
To find this for a specific model, check the model’s PreTrainedModel definition and look for _no_split_modules variable in the modeling_&lt;model_name&gt;.py file within transformers library.\n\nQ: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token\n\nA: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:\n\n\nspecial_tokens:\n  # str. If you're not sure, set to same as `eos_token`.\n  pad_token: \"...\"\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS token is incorrectly being masked or not being masked / EOS token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThis is because of the mismatch between tokenizer.eos_token and EOS token in template. 
Please make sure to set eos_token: under special_tokens: to the same EOS token as in template.\n\n\n\n\nThe EOS token is not in the template. Please check if your template is correct. As an example, phi_35 template does not use its dedicated EOS token &lt;|endoftext|&gt; at the end.\n\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.\n\nQ: The EOT token(s) are incorrectly being masked or not being masked / EOT token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThe EOT token is different from the EOS token and was not specified under eot_tokens:. Please set eot_tokens: to the same EOT token(s) as in template.\n\n\n\n\nThere is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.\n\n\nQ: EOT token encoding failed. Please check if the token is valid and can be encoded.\n\nA: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.\n\nQ: EOT token __ is encoded as multiple tokens.\n\nA: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:.\n\nQ: Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot\n\nA: This is because the EOS token is in the eot_tokens: while mismatch between train_on_eos: and train_on_eot:. This will cause one to override the other. 
Please ensure that train_on_eos: and train_on_eot: are the same or remove the EOS token from eot_tokens:.\n\nQ: If eot_tokens: is not provided, what happens?\n\nA: If eot_tokens: is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.\n\n\nInternally, eot_tokens: tokenizer.eos_token and train_on_eot: train_on_eos (which defaults to turn). This transition helps clarify the naming and behavior of EOT/EOS tokens.",
+    "text": "Name\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed dict of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
+  },
+  {
+    "objectID": "docs/api/core.chat.format.shared.html",
+    "href": "docs/api/core.chat.format.shared.html",
+    "title": "core.chat.format.shared",
+    "section": "",
+    "text": "core.chat.format.shared\ncore.chat.format.shared\nshared functions for format transforms"
+  },
+  {
+    "objectID": "docs/api/kernels.geglu.html",
+    "href": "docs/api/kernels.geglu.html",
+    "title": "kernels.geglu",
+    "section": "",
+    "text": "kernels.geglu\nModule for definition of GEGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
+  },
+  {
+    "objectID": "docs/api/kernels.geglu.html#functions",
+    "href": "docs/api/kernels.geglu.html#functions",
+    "title": "kernels.geglu",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
+  },
+  {
+    "objectID": "docs/api/prompt_strategies.dpo.chatml.html",
+    "href": "docs/api/prompt_strategies.dpo.chatml.html",
+    "title": "prompt_strategies.dpo.chatml",
+    "section": "",
+    "text": "prompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+  },
+  {
+    "objectID": "docs/api/prompt_strategies.dpo.chatml.html#functions",
+    "href": "docs/api/prompt_strategies.dpo.chatml.html#functions",
+    "title": "prompt_strategies.dpo.chatml",
+    "section": "",
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
+  },
+  {
+    "objectID": "docs/reward_modelling.html",
+    "href": "docs/reward_modelling.html",
+    "title": "Reward Modelling",
+    "section": "",
+    "text": "Overview\nReward modelling is a technique used to train models to predict the reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions.\nWe support the reward modelling techniques supported by trl.\n\n\n(Outcome) Reward Models\nOutcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).\nbase_model: google/gemma-2-2b\nmodel_type: AutoModelForSequenceClassification\nnum_labels: 1\ntokenizer_type: AutoTokenizer\n\nreward_model: true\nchat_template: gemma\ndatasets:\n  - path: argilla/distilabel-intel-orca-dpo-pairs\n    type: bradley_terry.chat_template\n\nval_set_size: 0.1\neval_steps: 100\nBradley-Terry chat templates expect single-turn conversations in the following format:\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nProcess Reward Models (PRM)\n\n\n\n\n\n\nTip\n\n\n\nCheck out our PRM blog.\n\n\nProcess reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.\nbase_model: Qwen/Qwen2.5-3B\nmodel_type: AutoModelForTokenClassification\nnum_labels: 2\n\nprocess_reward_model: true\ndatasets:\n  - path: trl-lib/math_shepherd\n    type: stepwise_supervised\n    split: train\n\nval_set_size: 0.1\neval_steps: 100\nPlease see stepwise_supervised for more details on the dataset format.",
+    "crumbs": [
+      "How To Guides",
+      "Reward Modelling"
+    ]
+  },
+  {
+    "objectID": "docs/config.html",
+    "href": "docs/config.html",
+    "title": "Config Reference",
+    "section": "",
+    "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings:\n# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast:\n# Whether to load the model with randomly initialized weights. 
Useful for\n# pre-training a model from scratch or debugging purposes.\nrandom_init_weights:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n  rope_scaling:\n    type: # linear | dynamic\n    factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n  # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n  # These are default values\n  llm_int8_has_fp16_weight: false\n  bnb_4bit_quant_type: nf4\n  bnb_4bit_use_double_quant: true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require &gt;=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require &gt;=ampere\n# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explicit fp16 setting\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require &gt;=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. 
Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins:\n  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\ndatasets:\n  # HuggingFace dataset repo | s3://,gs:// path | \"json\" for local dataset, make sure to fill data_files\n  - path: vicgalle/alpaca-gpt4\n    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n    type: alpaca # format | format:&lt;prompt_style&gt; (chat/instruct) | &lt;prompt_strategies&gt;.load_&lt;load_fn&gt;\n    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n    data_files: # Optional[str] path to source data files\n\n    shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n    shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n    preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n    name: # Optional[str] name of dataset configuration to load\n    split: train # Optional[str] name of dataset split to load from\n    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n    trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n  # Custom user instruction prompt\n  - path: repo\n    type:\n      # The below are defaults. 
only set what's needed if you use a different column name.\n      system_prompt: \"\"\n      system_format: \"{system}\"\n      field_system: system\n      field_instruction: instruction\n      field_input: input\n      field_output: output\n\n      # Customizable to be single line or multi-line\n      # Use {instruction}/{input} as key to be replaced\n      # 'format' can include {input}\n      format: |-\n        User: {instruction} {input}\n        Assistant:\n      # 'no_input_format' cannot include {input}\n      no_input_format: \"{instruction} \"\n\n      # For `completion` datasets only, uses the provided field instead of `text` column\n      field:\n\n  # Using chat template\n  - path: ...\n    # Set type to `chat_template` to use this strategy\n    type: chat_template\n    # Specify the name of the chat template to use\n    # The name of the chat template to use for training, following values are supported:\n    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n    # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n    chat_template: tokenizer_default\n\n    # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n    chat_template_jinja:\n\n    # Key containing the messages (default: \"messages\")\n    field_messages: messages\n\n    # Key containing the system message (default: \"system\")\n    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.\n    field_system: system\n\n    # Mapping of properties from the input dataset to the chat template.\n    # (default: message_property_mappings={'role':'role', 'content':'content'})\n    # If a property exists in the template but not in this mapping, the system will attempt\n    # to load it directly from the message using the property name as the key.\n    # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n    # while 'value' is loaded and used as 'content' in the chat template.\n    message_property_mappings:\n      role: from\n      content: value\n      # ...\n\n    # Optional[Dict[str, List]]. Roles mapping in the messages.\n    # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.\n    # The default is:\n    roles:\n      user: [\"human\", \"user\"]\n      assistant: [\"gpt\", \"assistant\"]\n      system: [\"system\"]\n      tool: [\"tool\"]\n\n    # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n    # This does not drop the default system message from chat_template if it exists. If you wish to,\n    # we recommend using a custom jinja template with the default system message removed or\n    # adding a system turn with empty content.\n    drop_system_message:\n\n    # Optional[bool]. 
(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags\n    # See example at `docs/dataset-formats/conversation.qmd`\n    split_thinking:\n\n    # IMPORTANT: The following fields determine which parts of the conversation to train on.\n    # Priority order: message_field_training &gt; message_field_training_detail &gt; train_on_inputs or role in roles_to_train\n    # See examples at `docs/dataset-formats/conversation.qmd`\n    # Note: If the below 5 fields are empty, defaults to training only on the last message.\n\n    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n    roles_to_train: [\"assistant\"]  # default\n    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:\n    # - all: train on all EOS tokens\n    # - turn (default): train on the EOS token at the end of each trainable turn\n    # - last: train on the last EOS token in the conversation\n    # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n    train_on_eos: turn\n    # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:\n    # - all: train on all EOT tokens\n    # - turn: train on the EOT token at the end of each trainable turn\n    # - last: train on the last EOT token in the conversation\n    # If not specified, defaults to the value of train_on_eos for backward compatibility.\n    train_on_eot:\n    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n    message_field_training: training\n    # The key in the message turn that contains the training details. 
Useful to selectively train on certain tokens in a turn.\n    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n    message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\nDeduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n  - path: /workspace/data/eval.jsonl\n    ds_type: json\n    # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n    split: train\n    type: completion\n    data_files:\n      - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta:  # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting:  # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0  # Weight of the BC regularizer\nsimpo_gamma: 0.5  # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n  vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.\n  vllm_server_port: # Optional[int]. 
Port of the vLLM server to connect to.\n  vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.\n\n  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n  num_generations: # Optional[int]. Number of generations to sample.\n  log_completions: # Optional[bool]. Whether to log completions.\n\n  sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. 
The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.\n# These tokens mark the boundaries between conversation turns.\n# For example: [\"/INST\", \"&lt;/s&gt;\", \"[/SYSTEM_PROMPT]\"]\n# If not specified, defaults to just the model's eos_token.\n# This is useful for templates that use multiple delimiter tokens.\neot_tokens:\n  # - \"&lt;/s&gt;\"\n  # - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n# Changes the default system message\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. 
This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these values.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (&lt;%1.)\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. 
Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\nsample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.\n\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\ncurriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n  - q_proj\n  - v_proj\n#  - k_proj\n#  - o_proj\n#  - gate_proj\n#  - down_proj\n#  - up_proj\nlora_target_linear: # If true, will target all linear modules\n\n# List[int] | int. # The layer indices to transform, otherwise, apply to all layers\n# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform\npeft_layers_to_transform:\n\n# Optional[bool]. 
Whether to use DoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora\npeft_use_dora:\n\n# Optional[bool]. Whether to use RSLoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora\npeft_use_rslora:\n\n# Optional[list[tuple[int, int]]]. List of layer indices to replicate.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora\npeft_layer_replication:\n\n# bool | Literal[\"gaussian\", \"eva\", \"olora\", \"pissa\", \"pissa_niter_[number of iters]\", \"corda\", \"loftq\"]\n# How to initialize LoRA weights. Default to True which is MS original implementation.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization\npeft_init_lora_weights:\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n#  - embed_tokens\n#  - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354  and `src/axolotl/core/train_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: #  loraplus learning rate for lora embedding layers. 
Default value is 1e-6.\n\npeft:\n  # Configuration options for loftq initialization for LoRA\n  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n  loftq_config:\n    loftq_bits:  # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. 
Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch&gt;=2.5.1\ntorch_compile:  # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend:  # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100  # cannot use with warmup_ratio\nwarmup_ratio: 0.05  # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. 
float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\nsave_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.\n# Maximum number of iterations to train for. It precedes num_epochs which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps =&gt; `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\ndo_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. 
Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n                # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n                # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing. 
Available options are: true, false, \"offload\", \"offload_disk\".\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n#   use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. 
When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_torch\n# - adamw_torch_fused\n# - adamw_torch_xla\n# - adamw_torch_npu_fused\n# - adamw_apex_fused\n# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version &gt;= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - adamw_torch_4bit\n# - ademamix\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - adamw_8bit   # alias for adamw_bnb_8bit\n# - ademamix_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_ademamix_32bit\n# - paged_ademamix_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - rmsprop\n# - rmsprop_bnb\n# - rmsprop_bnb_8bit\n# - rmsprop_bnb_32bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\n# - lomo\n# - adalomo\n# - grokadamw\n# - schedule_free_adamw\n# - schedule_free_sgd\n# - apollo_adamw\n# - apollo_adamw_layerwise\n#\n# Additional custom optimizers include:\n# - optimi_adamw\n# - ao_adamw_8bit\n# - ao_adamw_fp8\n# - came_pytorch\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For Galore Optimizers the following optim_args are available\n# rank:  # type: int\n# update_proj_gap  # type: int\n# scale  # type: float\n# proj_type:  # type: str, default = std\n\n# The target modules to optimize, i.e. 
the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn  # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_epsilon:\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Optional[bool]. Whether to bettertransformers\nflash_optimum:\n\n# Note: Only one of the following attention patches can be used at a time.\n# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.\n\n# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy:  # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm:  # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation\n# Optional[bool]. Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n\n# Optional[bool]. Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Optional[str]. Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# Optional[bool]. 
If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n## Multimodal section\n# int | tuple[int, int] | None . Size to resize images to, width x height.\n# Will read from model/processor config if not set.\nimage_size:\n# str. Algorithm to use for image resizing. \"bilinear\", \"bicubic\", \"lanczos\". Default is \"bilinear\".\nimage_resize_algorithm: 'bilinear'\n## End of multimodal section\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n  # bos_token: \"&lt;s&gt;\"\n  # eos_token: \"&lt;/s&gt;\"\n  # unk_token: \"&lt;unk&gt;\"\n  # pad_token: \"[PAD]\"\n\n# Optional[list[str]]. Add extra tokens to the tokenizer.\ntokens:\n  # - \"&lt;|startoftext|&gt;\"\n  # - \"&lt;|endoftext|&gt;\"\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides:  # Dict[int, str]\n#  128041: \"&lt;|im_start|&gt;\"\n#  128042: \"&lt;|im_end|&gt;\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Sequence parallelism\n# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.\n# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.\n# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized\n# subsequences, or set to 4 to split into four equal-sized subsequences.\n# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.\nsequence_parallel_degree:\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\n# Must evenly divide the number of KV heads in your model.\nheads_k_stride: 1\n# One of \"varlen_llama3\", \"batch_ring\", \"batch_zigzag\", \"batch_stripe\". Defaults to \"varlen_llama3\"\n# in the sample packing case, and \"batch_ring\" in the non-sample packing case.\nring_attn_func:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:",
+    "crumbs": [
+      "Getting Started",
+      "Config Reference"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_loading.html",
+    "href": "docs/dataset_loading.html",
+    "title": "Dataset Loading",
+    "section": "",
+    "text": "Datasets can be loaded in a number of different ways depending on how they are saved (the extension of the file) and where they are stored.",
+    "crumbs": [
+      "How To Guides",
+      "Dataset Loading"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_loading.html#overview",
+    "href": "docs/dataset_loading.html#overview",
+    "title": "Dataset Loading",
+    "section": "",
+    "text": "Datasets can be loaded in a number of different ways depending on how they are saved (the extension of the file) and where they are stored.",
+    "crumbs": [
+      "How To Guides",
+      "Dataset Loading"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_loading.html#loading-datasets",
+    "href": "docs/dataset_loading.html#loading-datasets",
+    "title": "Dataset Loading",
+    "section": "Loading Datasets",
+    "text": "Loading Datasets\nWe use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.\nYou may recognize the similar named configs between load_dataset and the datasets section of the config file.\ndatasets:\n  - path:\n    name:\n    data_files:\n    split:\n    revision:\n    trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to load different dataset types, see here.\nFor full details on the config, see config.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by more than one entry under datasets.\ndatasets:\n  - path: /path/to/your/dataset\n  - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nUsually, to load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n  - path: json\n    data_files: /path/to/your/file.jsonl\nHowever, to make things easier, we have added a few shortcuts for loading local dataset files.\nYou can just point the path to the file or directory along with the ds_type to load the dataset. 
The below example shows for a JSON file:\ndatasets:\n  - path: /path/to/your/file.jsonl\n    ds_type: json\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading entire directory of files (such as with parquet/arrow files)\ndatasets:\n  - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n    # single file\n  - path: /path/to/your/directory\n    ds_type: csv\n    data_files: file1.csv\n\n    # multiple files\n  - path: /path/to/your/directory\n    ds_type: json\n    data_files:\n      - file1.jsonl\n      - file2.jsonl\n\n    # multiple files for parquet\n  - path: /path/to/your/directory\n    ds_type: parquet\n    data_files:\n      - file1.parquet\n      - file2.parquet\n\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag in the root-level of the config file.\n\n\n\nFolder uploaded\nThis would mean that the dataset is a single file or file(s) uploaded to the Hub.\ndatasets:\n  - path: org/dataset-name\n    data_files:\n      - file1.jsonl\n      - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n  - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nThere are some 
other configs which may be required like name, split, revision, trust_remote_code, etc depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocols.\ndatasets:\n    # Single file\n  - path: s3://bucket-name/path/to/your/file.jsonl\n\n    # Directory\n  - path: s3://bucket-name/path/to/your/directory\nFor directory, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! 
We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nIt would attempt to read in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on OCI resource, resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n  - path: https://path/to/your/dataset/file.jsonl\nThis must be publicly accessible.",
+    "crumbs": [
+      "How To Guides",
+      "Dataset Loading"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_loading.html#next-steps",
+    "href": "docs/dataset_loading.html#next-steps",
+    "title": "Dataset Loading",
+    "section": "Next steps",
+    "text": "Next steps\nNow that you know how to load datasets, you can learn more about how to load your specific dataset format into your target output format in the dataset formats docs.",
+    "crumbs": [
+      "How To Guides",
+      "Dataset Loading"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html",
+    "href": "docs/multi-gpu.html",
+    "title": "Multi-GPU",
+    "section": "",
+    "text": "This guide covers advanced training configurations for multi-GPU setups using Axolotl.",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-overview",
+    "href": "docs/multi-gpu.html#sec-overview",
+    "title": "Multi-GPU",
+    "section": "1 Overview",
+    "text": "1 Overview\nAxolotl supports several methods for multi-GPU training:\n\nDeepSpeed (recommended)\nFSDP (Fully Sharded Data Parallel)\nSequence parallelism\nFSDP + QLoRA",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-deepspeed",
+    "href": "docs/multi-gpu.html#sec-deepspeed",
+    "title": "Multi-GPU",
+    "section": "2 DeepSpeed",
+    "text": "2 DeepSpeed\nDeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.\n\n2.1 Configuration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\n2.2 Usage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing arg via config\naxolotl train config.yml\n\n# Passing arg via cli\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\n2.3 ZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params (zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nChoose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.\nStart from Stage 1 -&gt; Stage 2 -&gt; Stage 3.",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-fsdp",
+    "href": "docs/multi-gpu.html#sec-fsdp",
+    "title": "Multi-GPU",
+    "section": "3 FSDP",
+    "text": "3 FSDP\n\n3.1 Basic FSDP Configuration\nfsdp:\n  - full_shard\n  - auto_wrap\nfsdp_config:\n  fsdp_offload_params: true\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-sequence-parallelism",
+    "href": "docs/multi-gpu.html#sec-sequence-parallelism",
+    "title": "Multi-GPU",
+    "section": "4 Sequence parallelism",
+    "text": "4 Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful in the event that a\nsingle sequence causes OOM errors during model training.\nFirst, install ring-flash-attn, recommended via pip install axolotl[ring-flash-attn],\nor from source with pip install .[ring-flash-attn].\nYour Axolotl YAML config should contain the following lines:\nsequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU\nflash_attention: true  # Required with sequence parallelism\n\n# Optional; strides across the key dimension. Larger values use more memory but will make training faster.\nheads_k_stride: 1\nSee our dedicated guide for more details.\n\n4.1 FSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-performance",
+    "href": "docs/multi-gpu.html#sec-performance",
+    "title": "Multi-GPU",
+    "section": "5 Performance Optimization",
+    "text": "5 Performance Optimization\n\n5.1 Liger Kernel Integration\nPlease see docs for more info.",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/multi-gpu.html#sec-troubleshooting",
+    "href": "docs/multi-gpu.html#sec-troubleshooting",
+    "title": "Multi-GPU",
+    "section": "6 Troubleshooting",
+    "text": "6 Troubleshooting\n\n6.1 NCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\n6.2 Common Problems\n\nMemory IssuesTraining Instability\n\n\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\n\n\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\n\n\nFor more detailed troubleshooting, see our debugging guide.",
+    "crumbs": [
+      "Deployments",
+      "Multi-GPU"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html",
+    "href": "docs/installation.html",
+    "title": "Installation",
+    "section": "",
+    "text": "This guide covers all the ways you can install and set up Axolotl for your environment.",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-requirements",
+    "href": "docs/installation.html#sec-requirements",
+    "title": "Installation",
+    "section": "1 Requirements",
+    "text": "1 Requirements\n\nNVIDIA GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU\nPython ≥3.10\nPyTorch ≥2.4.1",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-installation-methods",
+    "href": "docs/installation.html#sec-installation-methods",
+    "title": "Installation",
+    "section": "2 Installation Methods",
+    "text": "2 Installation Methods\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure to have Pytorch installed before installing Axolotl in your local environment.\nFollow the instructions at: https://pytorch.org/get-started/locally/\n\n\n\n2.1 PyPI Installation (Recommended)\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nWe use --no-build-isolation in order to detect the installed PyTorch version (if\ninstalled) in order not to clobber it, and so that we set the correct version of\ndependencies that are specific to the PyTorch version or other installed\nco-dependencies.\n\n\n2.2 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\n2.3 Docker\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nAdvanced Docker Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n  --name axolotl --ipc=host \\\n  --ulimit memlock=-1 --ulimit stack=67108864 \\\n  --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n  axolotlai/axolotl:main-latest\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-cloud",
+    "href": "docs/installation.html#sec-cloud",
+    "title": "Installation",
+    "section": "3 Cloud Environments",
+    "text": "3 Cloud Environments\n\n3.1 Cloud GPU Providers\nFor providers supporting Docker:\n\nUse axolotlai/axolotl-cloud:main-latest\nAvailable on:\n\nLatitude.sh\nJarvisLabs.ai\nRunPod\nNovita\n\n\n\n\n3.2 Google Colab\nUse our example notebook.",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-platform-specific",
+    "href": "docs/installation.html#sec-platform-specific",
+    "title": "Installation",
+    "section": "4 Platform-Specific Instructions",
+    "text": "4 Platform-Specific Instructions\n\n4.1 macOS\npip3 install --no-build-isolation -e '.'\nSee Section 6 for Mac-specific issues.\n\n\n4.2 Windows\n\n\n\n\n\n\nImportant\n\n\n\nWe recommend using WSL2 (Windows Subsystem for Linux) or Docker.",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-env-managers",
+    "href": "docs/installation.html#sec-env-managers",
+    "title": "Installation",
+    "section": "5 Environment Managers",
+    "text": "5 Environment Managers\n\n5.1 Conda/Pip venv\n\nInstall Python ≥3.10\nInstall PyTorch: https://pytorch.org/get-started/locally/\nInstall Axolotl:\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n(Optional) Login to Hugging Face:\nhuggingface-cli login",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/installation.html#sec-troubleshooting",
+    "href": "docs/installation.html#sec-troubleshooting",
+    "title": "Installation",
+    "section": "6 Troubleshooting",
+    "text": "6 Troubleshooting\nIf you encounter installation issues, see our FAQ and Debugging Guide.",
+    "crumbs": [
+      "Getting Started",
+      "Installation"
+    ]
+  },
+  {
+    "objectID": "docs/docker.html",
+    "href": "docs/docker.html",
+    "title": "Docker",
+    "section": "",
+    "text": "This section describes the different Docker images that are released by AxolotlAI at Docker Hub.",
+    "crumbs": [
+      "Deployments",
+      "Docker"
+    ]
+  },
+  {
+    "objectID": "docs/docker.html#base",
+    "href": "docs/docker.html#base",
+    "title": "Docker",
+    "section": "Base",
+    "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\naxolotlai/axolotl-base\nLink: Docker Hub\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.7.0\nmain-base-py3.11-cu126-2.7.0\nmain-base-py3.11-cu124-2.6.0\nmain-base-py3.11-cu124-2.5.1\nmain-base-py3.11-cu124-2.4.1",
+    "crumbs": [
+      "Deployments",
+      "Docker"
+    ]
+  },
+  {
+    "objectID": "docs/docker.html#main",
+    "href": "docs/docker.html#main",
+    "title": "Docker",
+    "section": "Main",
+    "text": "Main\nThe main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\naxolotlai/axolotl\nLink: Docker Hub\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu126-2.7.0\nmain-py3.11-cu124-2.6.0\nmain-py3.11-cu124-2.5.1\nmain-py3.11-cu124-2.4.1\nmain-latest\nmain-20250303-py3.11-cu124-2.6.0\nmain-20250303-py3.11-cu124-2.5.1\nmain-20250303-py3.11-cu124-2.4.1\n0.7.1",
+    "crumbs": [
+      "Deployments",
+      "Docker"
+    ]
+  },
+  {
+    "objectID": "docs/docker.html#cloud",
+    "href": "docs/docker.html#cloud",
+    "title": "Docker",
+    "section": "Cloud",
+    "text": "Cloud\nThe cloud image is the image that is used to run Axolotl in the cloud. It is based on the axolotlai/axolotl image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.\n\n\n\n\n\n\nTip\n\n\n\nJupyter lab is run by default. Set JUPYTER_DISABLE=1 in the environment variables to disable it.\n\n\n\nImage\naxolotlai/axolotl-cloud\nLink: Docker Hub\n\n\nTags format\nThis uses the same tags as the main image.\n\n\nEnvironment variables\n\nJUPYTER_DISABLE: Disable Jupyter lab.\nJUPYTER_PASSWORD: Set a password for the Jupyter lab.\nPUBLIC_KEY / SSH_KEY: Add a public key for the SSH service.\n\n\n\nVolume mounts\n\n\n\n\n\n\nTip\n\n\n\nWe recommend mounting volumes to /workspace/data for data persistence. /workspace/axolotl contains the source code and is ephemeral.\n\n\n\n/workspace/data/axolotl-artifacts: Directory to store Axolotl artifacts.\n/workspace/data/huggingface-cache: Directory to store HuggingFace cache.",
+    "crumbs": [
+      "Deployments",
+      "Docker"
+    ]
+  },
+  {
+    "objectID": "docs/docker.html#cloud-no-tmux",
+    "href": "docs/docker.html#cloud-no-tmux",
+    "title": "Docker",
+    "section": "Cloud-no-tmux",
+    "text": "Cloud-no-tmux\nThis is the same as the cloud image but without tmux.\n\nImage\naxolotlai/axolotl-cloud-term\nLink: Docker Hub\n\n\n\n\n\n\nNote\n\n\n\nThe naming may be a bit confusing as it has -term appended to the end.\n\n\n\n\nTags format\nThis uses the same tags as the cloud image.",
+    "crumbs": [
+      "Deployments",
+      "Docker"
+    ]
+  },
+  {
+    "objectID": "docs/multipack.html",
+    "href": "docs/multipack.html",
+    "title": "Multipack (Sample Packing)",
+    "section": "",
+    "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d attention mask. We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B ]\n   C C C C C C C ]\n   D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B X X X X X X ]\n   C C C C C C C X X X X ]\n   D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n   B C C C C C C C D D D D E E E E\n   E E E E F F F F F G G G H H H H\n   I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64]]",
+    "crumbs": [
+      "Core Concepts",
+      "Multipack (Sample Packing)"
+    ]
+  },
+  {
+    "objectID": "docs/multipack.html#visualization-of-multipack-with-flash-attention",
+    "href": "docs/multipack.html#visualization-of-multipack-with-flash-attention",
+    "title": "Multipack (Sample Packing)",
+    "section": "",
+    "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d attention mask. We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B ]\n   C C C C C C C ]\n   D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B X X X X X X ]\n   C C C C C C C X X X X ]\n   D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n   B C C C C C C C D D D D E E E E\n   E E E E F F F F F G G G H H H H\n   I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41, 44, 48, 51, 55, 60, 64]]",
+    "crumbs": [
+      "Core Concepts",
+      "Multipack (Sample Packing)"
+    ]
+  },
+  {
+    "objectID": "docs/multipack.html#multipack-without-flash-attention",
+    "href": "docs/multipack.html#multipack-without-flash-attention",
+    "title": "Multipack (Sample Packing)",
+    "section": "Multipack without Flash Attention",
+    "text": "Multipack without Flash Attention\nMultipack can still be achieved without Flash attention, but with lower packing\nefficiency as we are not able to join multiple batches into a single batch due to\ncontext length limits without flash attention. We can use either Pytorch’s Scaled\nDot Product Attention implementation or native Pytorch attention implementation\nalong with 4d attention masks\nto pack sequences together and avoid cross attention.",
+    "crumbs": [
+      "Core Concepts",
+      "Multipack (Sample Packing)"
+    ]
+  },
+  {
+    "objectID": "docs/debugging.html",
+    "href": "docs/debugging.html",
+    "title": "Debugging",
+    "section": "",
+    "text": "This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.",
     "crumbs": [
       "Troubleshooting",
-      "FAQ"
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/dataset_preprocessing.html",
-    "href": "docs/dataset_preprocessing.html",
-    "title": "Dataset Preprocessing",
-    "section": "",
-    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.",
+    "objectID": "docs/debugging.html#table-of-contents",
+    "href": "docs/debugging.html#table-of-contents",
+    "title": "Debugging",
+    "section": "Table of Contents",
+    "text": "Table of Contents\n\nGeneral Tips\nDebugging with VSCode\n\nBackground\nConfiguration\nCustomizing your debugger\nVideo Tutorial\n\nDebugging With Docker\n\nSetup\nAttach To Container\nVideo - Attaching To Docker On Remote Host",
     "crumbs": [
-      "Core Concepts",
-      "Dataset Preprocessing"
+      "Troubleshooting",
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/dataset_preprocessing.html#overview",
-    "href": "docs/dataset_preprocessing.html#overview",
-    "title": "Dataset Preprocessing",
-    "section": "",
-    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.",
+    "objectID": "docs/debugging.html#general-tips",
+    "href": "docs/debugging.html#general-tips",
+    "title": "Debugging",
+    "section": "General Tips",
+    "text": "General Tips\nWhile debugging it’s helpful to simplify your test scenario as much as possible. Here are some tips for doing so:\n\n[!Important]\nAll of these tips are incorporated into the example configuration for debugging with VSCode below.\n\n\nMake sure you are using the latest version of axolotl: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from main.\nEliminate concurrency: Restrict the number of processes to 1 for both training and data preprocessing:\n\nSet CUDA_VISIBLE_DEVICES to a single GPU, ex: export CUDA_VISIBLE_DEVICES=0.\nSet dataset_processes: 1 in your axolotl config or run the training command with --dataset_processes=1.\n\nUse a small dataset: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure sample_packing: False and eval_sample_packing: False to avoid errors. If you are in a pinch and don’t have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):\ndatasets:\n    ...\n    shards: 20\nUse a small model: A good example of a small model is TinyLlama/TinyLlama-1.1B-Chat-v1.0.\nMinimize iteration time: Make sure the training loop finishes as fast as possible, with these settings.\n\nmicro_batch_size: 1\nmax_steps: 1\nval_set_size: 0\n\nClear Caches: Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging.\n\nData preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in dataset_prepared_path: in your axolotl config. 
If you didn’t set this value, the default is last_run_prepared.\nHF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache HuggingFace cache, by deleting the appropriate ~/.cache/huggingface/datasets/... folder(s).\nThe recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run. This is demonstrated in the example configuration below.",
     "crumbs": [
-      "Core Concepts",
-      "Dataset Preprocessing"
+      "Troubleshooting",
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/ray-integration.html",
-    "href": "docs/ray-integration.html",
-    "title": "Ray Train",
-    "section": "",
-    "text": "Axolotl supports using Ray as an alternative to accelerate for orchestrating training. This is especially useful for multi-node training since you only have to setup code and dependencies in a single node and launch training as if you were using a single node.\nWith the --use-ray CLI flag, Axolotl will use Ray Train’s TorchTrainer to run training.",
+    "objectID": "docs/debugging.html#debugging-with-vscode",
+    "href": "docs/debugging.html#debugging-with-vscode",
+    "title": "Debugging",
+    "section": "Debugging with VSCode",
+    "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:\ndatasets:\n  - path: &lt;path to your chat_template formatted dataset&gt; # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n    type: chat_template\n\n[!Important]\nIf you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip]\nIf you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. 
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n    \"version\": \"0.2.0\",\n    \"configurations\": [\n        {\n            \"name\": \"Debug axolotl prompt - chat_template\",\n            \"type\": \"python\",\n            \"module\": \"accelerate.commands.launch\",\n            \"request\": \"launch\",\n            \"args\": [\n                \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n                // The flags below simplify debugging by overriding the axolotl config\n                // with the debugging tips above.  Modify as needed.\n                \"--dataset_processes=1\",      // limits data preprocessing to one process\n                \"--max_steps=1\",              // limits training to just one step\n                \"--batch_size=1\",             // minimizes batch size\n                \"--micro_batch_size=1\",       // minimizes batch size\n                \"--val_set_size=0\",           // disables validation\n                \"--sample_packing=False\",     // disables sample packing which is necessary for small datasets\n                \"--eval_sample_packing=False\",// disables sample packing on eval set\n                \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n                \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n                ],\n            \"console\": \"integratedTerminal\",      // show output in the integrated terminal\n            \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n            \"justMyCode\": true,                   // step through only axolotl code\n            \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\",  // Since we aren't doing distributed training, we need to limit to one GPU\n                    
\"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n            \"preLaunchTask\": \"cleanup-for-dataprep\", // delete temp folders (see below)\n        }\n    ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip]\nYou may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. 
A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n    \"version\": \"2.0.0\",\n    \"tasks\": [\n      // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n      {\n        \"label\": \"delete-outputs\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n      // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n      {\n        \"label\": \"delete-temp-hf-dataset-cache\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n        // this task combines the two tasks above\n      {\n       \"label\": \"cleanup-for-dataprep\",\n       \"dependsOn\": [\"delete-outputs\", \"delete-temp-hf-dataset-cache\"],\n      }\n    ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode",
     "crumbs": [
-      "Deployments",
-      "Ray Train"
+      "Troubleshooting",
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/ray-integration.html#ray-cluster-setup",
-    "href": "docs/ray-integration.html#ray-cluster-setup",
-    "title": "Ray Train",
-    "section": "Ray cluster setup",
-    "text": "Ray cluster setup\nA prerequisite using the Ray Train integration is to setup a Ray cluster on your desired node(s). For a detailed guide on how you can get started with ray clusters, check the official Ray docs here.\nEvery Ray cluster has one head node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and depending on the resources (number of CPUs, GPUs, etc) they request, will be scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer this doc.",
+    "objectID": "docs/debugging.html#debugging-with-docker",
+    "href": "docs/debugging.html#debugging-with-docker",
+    "title": "Debugging",
+    "section": "Debugging With Docker",
+    "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip]\nIf you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1\n\n[!Tip]\nTo understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, perform an editable install of Axolotl:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. 
Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host",
     "crumbs": [
-      "Deployments",
-      "Ray Train"
+      "Troubleshooting",
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/ray-integration.html#sanity-check",
-    "href": "docs/ray-integration.html#sanity-check",
-    "title": "Ray Train",
-    "section": "Sanity check",
-    "text": "Sanity check\nTo run a sanity check on whether your ray cluster is setup properly, execute the following on the head node:\nray status\nThe output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:\nNode status\n---------------------------------------------------------------\nActive:\n 1 head\nIdle:\n 2 4xL40S:48CPU-384GB\nPending:\n (no pending nodes)\nRecent failures:\n (no failures)\n\nResources\n---------------------------------------------------------------\nUsage:\n 0.0/96.0 CPU\n 0.0/8.0 GPU\n 0B/800.00GiB memory\n 0B/229.57GiB object_store_memory\n\nDemands:\n (no resource demands)\nYou should also be able to see the same on the Ray dashboard.",
-    "crumbs": [
-      "Deployments",
-      "Ray Train"
-    ]
-  },
-  {
-    "objectID": "docs/ray-integration.html#configuring-training-with-ray-train",
-    "href": "docs/ray-integration.html#configuring-training-with-ray-train",
-    "title": "Ray Train",
-    "section": "Configuring training with Ray Train",
-    "text": "Configuring training with Ray Train\nYou can find an example configuration at configs/llama-3/lora-1b-ray.yaml.\nThe key parameters to note here are:\nuse_ray: true\nray_num_workers: 4\n# optional\nresources_per_worker:\n    GPU: 1\n\nuse_ray: This is the flag that enables the Ray Train integration. You can either use the corresponding --use-ray flag in the CLI or set use_ray in the config file.\nray_num_workers: This is the number of workers/GPUs to use for training.\nresources_per_worker: This is the Ray resource request for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do\n\nresources_per_worker:\n    accelerator_type:L40S: 0.001",
-    "crumbs": [
-      "Deployments",
-      "Ray Train"
-    ]
-  },
-  {
-    "objectID": "docs/ray-integration.html#launching-training",
-    "href": "docs/ray-integration.html#launching-training",
-    "title": "Ray Train",
-    "section": "Launching training",
-    "text": "Launching training\nYou can simply run the following command on the head node:\naxolotl train examples/llama-3/lora-1b-ray.yml --use-ray\nThis will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.\nYou can also monitor training progress on the Ray dashboard.\nComing back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let’s say you want to make use of all 8 GPUs. You would be able to just set ray_num_workers: 8 and run the previous command. The Cluster tab will show the following:\n\n\n\nRay dashboard",
-    "crumbs": [
-      "Deployments",
-      "Ray Train"
-    ]
-  },
-  {
-    "objectID": "docs/amd_hpc.html",
-    "href": "docs/amd_hpc.html",
-    "title": "AMD GPUs on HPC Systems",
-    "section": "",
-    "text": "This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.",
-    "crumbs": [
-      "Deployments",
-      "AMD GPUs on HPC Systems"
-    ]
-  },
-  {
-    "objectID": "docs/amd_hpc.html#setup",
-    "href": "docs/amd_hpc.html#setup",
-    "title": "AMD GPUs on HPC Systems",
-    "section": "Setup",
-    "text": "Setup\n\n1. Install Python\nWe recommend using Miniforge, a minimal conda-based Python distribution:\ncurl -L -O \"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh\"\nbash Miniforge3-$(uname)-$(uname -m).sh\n\n\n2. Configure Python Environment\nAdd Python to your PATH and ensure it’s available at login:\necho 'export PATH=~/miniforge3/bin:$PATH' &gt;&gt; ~/.bashrc\necho 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' &gt;&gt; ~/.bash_profile\n\n\n3. Load AMD GPU Software\nLoad the ROCm module:\nmodule load rocm/5.7.1\nNote: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.\n\n\n4. Install PyTorch\nInstall PyTorch with ROCm support:\npip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall\n\n\n5. Install Flash Attention\nClone and install the Flash Attention repository:\ngit clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git\nexport GPU_ARCHS=\"gfx90a\"\ncd flash-attention\nexport PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')\npatch \"${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py\" hipify_patch.patch\npip install --no-build-isolation .\n\n\n6. Install Axolotl\nClone and install Axolotl:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\npip install packaging ninja\npip install --no-build-isolation -e .\n\n\n7. Apply xformers Workaround\nxformers appears to be incompatible with ROCm. Apply the following workarounds:\n- Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers.\n- Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.\n\n\n8. 
Prepare Job Submission Script\nCreate a script for job submission using your HPC’s particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include\nexport TRANSFORMERS_OFFLINE=1\nexport HF_DATASETS_OFFLINE=1\n\n\n9. Download Base Model\nDownload a base model using the Hugging Face CLI:\nhuggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B\n\n\n10. Create Axolotl Configuration\nCreate an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.\nNote: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.\n\n\n11. Preprocess Data\nRun preprocessing on the login node:\nCUDA_VISIBLE_DEVICES=\"\" python -m axolotl.cli.preprocess /path/to/your/config.yaml\n\n\n12. Train\nYou are now ready to submit your previously prepared job script. 🚂",
-    "crumbs": [
-      "Deployments",
-      "AMD GPUs on HPC Systems"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html",
-    "href": "docs/sequence_parallelism.html",
-    "title": "Sequence Parallelism",
-    "section": "",
-    "text": "Sequence parallelism is a technique that splits sequences across multiple GPUs,\nallowing you to train with very long sequences that wouldn’t fit on a single GPU. Each\nGPU processes a different portion of the sequence, and the results are aggregated\nthrough a ring communication pattern.",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism",
-    "href": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism",
-    "title": "Sequence Parallelism",
-    "section": "When to Use Sequence Parallelism",
-    "text": "When to Use Sequence Parallelism\nUse sequence parallelism when:\n\nYou need to train with sequence lengths that don’t fit into a single GPU’s memory\nYou have multiple GPUs available\nYou’re experiencing OOM (Out Of Memory) errors with long sequences",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#configuration",
-    "href": "docs/sequence_parallelism.html#configuration",
-    "title": "Sequence Parallelism",
-    "section": "Configuration",
-    "text": "Configuration\nTo enable sequence parallelism, add the following to your configuration file:\n# Set to a divisor (&gt; 1) of the number of GPUs available\nsequence_parallel_degree: 4  # Split sequences across 4 GPUs\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\nheads_k_stride: 1\n# Optional; one of \"varlen_llama3\" or \"batch_ring\". Defaults to\n# \"varlen_llama3\" when `sample_packing: true`, and \"batch_ring\" otherwise.\nring_attn_func:\nThe sequence_parallel_degree should be a divisor of the total number of GPUs. For example:\n\nWith 8 GPUs, valid values would be 2, 4, or 8\nWith 4 GPUs, valid values would be 2 or 4",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#implementation-details",
-    "href": "docs/sequence_parallelism.html#implementation-details",
-    "title": "Sequence Parallelism",
-    "section": "Implementation Details",
-    "text": "Implementation Details\nWhen sequence parallelism is enabled:\n\nEach sequence is divided into equal chunks across the GPUs in a sequence parallel group\nThe data collator handles the chunking of input_ids, attention_mask, labels, and position_ids\nPosition IDs are adjusted to maintain proper relative positions, especially for packed sequences\nThe trainer uses special ring communication patterns for attention operations",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#requirements",
-    "href": "docs/sequence_parallelism.html#requirements",
-    "title": "Sequence Parallelism",
-    "section": "Requirements",
-    "text": "Requirements\nTo use sequence parallelism, you need:\n\nMultiple GPUs (at least 2)\nThe ring-flash-attn package. Install with:\n\npip install axolotl[ring-flash-attn] (preferred)\npip install ring-flash-attn&gt;=0.1.4",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#limitations",
-    "href": "docs/sequence_parallelism.html#limitations",
-    "title": "Sequence Parallelism",
-    "section": "Limitations",
-    "text": "Limitations\n\nFlash attention must be enabled for this to work (flash_attention: true in config YAML)\nMay have a small performance overhead due to communication between GPUs",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#example",
-    "href": "docs/sequence_parallelism.html#example",
-    "title": "Sequence Parallelism",
-    "section": "Example",
-    "text": "Example\nbase_model: meta-llama/Llama-3-8B-Instruct\nsequence_len: 8192\n\n...\n\nsequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU\nflash_attention: true  # Required with sequence parallelism\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\nheads_k_stride: 1\n\n...\nThis will train the Llama 3 8B model with 8K context length, with each sequence split\ninto 2 subsequences of length 4096 across 2 GPUs.",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism",
-    "href": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism",
-    "title": "Sequence Parallelism",
-    "section": "Sample Packing with Sequence Parallelism",
-    "text": "Sample Packing with Sequence Parallelism\nSequence parallelism is compatible with Axolotl’s sample packing functionality. When using both features together:\n\nSamples are first packed together\nThe packed sequences are then divided across GPUs in the sequence parallel group\nPosition IDs are automatically adjusted to maintain proper relative positions",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/sequence_parallelism.html#effect-on-batch-size",
-    "href": "docs/sequence_parallelism.html#effect-on-batch-size",
-    "title": "Sequence Parallelism",
-    "section": "Effect on Batch Size",
-    "text": "Effect on Batch Size\nWhen using sequence parallelism, your effective global batch size is divided by the sequence_parallel_degree. This happens because:\n\nEach group of sequence_parallel_degree GPUs works on the same batch (just different parts of each sequence)\nThe number of batches processed per step decreases\n\nFor example:\n- With 8 GPUs and no sequence parallelism: 8 different batches processed per step\n- With 8 GPUs and sequence_parallel_degree=4: Only 2 different batches processed per step (each split across 4 GPUs)\n- If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4",
-    "crumbs": [
-      "Advanced Features",
-      "Sequence Parallelism"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html",
-    "href": "docs/fsdp_qlora.html",
-    "title": "FDSP + QLoRA",
-    "section": "",
-    "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html#background",
-    "href": "docs/fsdp_qlora.html#background",
-    "title": "FDSP + QLoRA",
-    "section": "",
-    "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html#usage",
-    "href": "docs/fsdp_qlora.html#usage",
-    "title": "FDSP + QLoRA",
-    "section": "Usage",
-    "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\n![Tip]\nSee the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral or mixtral.",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html#example-config",
-    "href": "docs/fsdp_qlora.html#example-config",
-    "title": "FDSP + QLoRA",
-    "section": "Example Config",
-    "text": "Example Config\nexamples/llama-2/qlora-fsdp.yml contains an example of how to enable QLoRA + FSDP in axolotl.",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html#references",
-    "href": "docs/fsdp_qlora.html#references",
-    "title": "FDSP + QLoRA",
-    "section": "References",
-    "text": "References\n\nPR #1378 enabling QLoRA in FSDP in Axolotl.\nBlog Post from the Answer.AI team describing the work that enabled QLoRA in FSDP.\nRelated HuggingFace PRs Enabling FDSP + QLoRA:\n\nAccelerate PR#2544\nTransformers PR#29587\nTRL PR#1416\nPEFT PR#1550",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/fsdp_qlora.html#footnotes",
-    "href": "docs/fsdp_qlora.html#footnotes",
-    "title": "FDSP + QLoRA",
+    "objectID": "docs/debugging.html#footnotes",
+    "href": "docs/debugging.html#footnotes",
+    "title": "Debugging",
     "section": "Footnotes",
-    "text": "Footnotes\n\n\nThis was enabled by this work from the Answer.AI team.↩︎",
-    "crumbs": [
-      "Advanced Features",
-      "FDSP + QLoRA"
-    ]
-  },
-  {
-    "objectID": "docs/lr_groups.html",
-    "href": "docs/lr_groups.html",
-    "title": "Learning Rate Groups",
-    "section": "",
-    "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.",
-    "crumbs": [
-      "How To Guides",
-      "Learning Rate Groups"
-    ]
-  },
-  {
-    "objectID": "docs/lr_groups.html#background",
-    "href": "docs/lr_groups.html#background",
-    "title": "Learning Rate Groups",
-    "section": "",
-    "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.",
-    "crumbs": [
-      "How To Guides",
-      "Learning Rate Groups"
-    ]
-  },
-  {
-    "objectID": "docs/lr_groups.html#example",
-    "href": "docs/lr_groups.html#example",
-    "title": "Learning Rate Groups",
-    "section": "Example",
-    "text": "Example\nlr_groups:\n  - name: o_proj\n    modules:\n      - self_attn.o_proj.weight\n    lr: 1e-6\n  - name: q_proj\n    modules:\n      - model.layers.2.self_attn.q_proj.weight\n    lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate\nof 1e-6 for all the self attention o_proj modules across all layers, and a learning are of 1e-5 to the 3rd layer’s\nself attention q_proj module.",
-    "crumbs": [
-      "How To Guides",
-      "Learning Rate Groups"
-    ]
-  },
-  {
-    "objectID": "docs/batch_vs_grad.html",
-    "href": "docs/batch_vs_grad.html",
-    "title": "Batch size vs Gradient accumulation",
-    "section": "",
-    "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. 
As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1:\nMicro batch size: 3\nGradient accumulation steps: 2\nNumber of GPUs: 3\nTotal batch size = 3 * 2 * 3 = 18\n| GPU 1          | GPU 2          | GPU 3          |\n|----------------|----------------|----------------|\n| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |\n| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |\n| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |\n|----------------|----------------|----------------|\n| → (apply)      | → (apply)      | → (apply)      |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2:\nMicro batch size: 2\nGradient accumulation steps: 1\nNumber of GPUs: 3\nTotal batch size = 2 * 1 * 3 = 6\n| GPU 1     | GPU 2     | GPU 3     |\n|-----------|-----------|-----------|\n| S1, S2    | S3, S4    | S5, S6    |\n| e1, e2    | e3, e4    | e5, e6    |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)",
-    "crumbs": [
-      "Core Concepts",
-      "Batch size vs Gradient accumulation"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/pretraining.html",
-    "href": "docs/dataset-formats/pretraining.html",
-    "title": "Pre-training",
-    "section": "",
-    "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset:\n  - name:\n    path:\n    split:\n    text_column: # column in dataset with the data, usually `text`\n    type: pretrain\n    trust_remote_code:\n    skip: # number of rows of data to skip over from the beginning",
-    "crumbs": [
-      "Dataset Formats",
-      "Pre-training"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/template_free.html",
-    "href": "docs/dataset-formats/template_free.html",
-    "title": "Template-Free",
-    "section": "",
-    "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters &lt;|im_start|&gt; that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n  - path: output.jsonl\n    type: input_output  # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.",
-    "crumbs": [
-      "Dataset Formats",
-      "Template-Free"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/template_free.html#sec-background",
-    "href": "docs/dataset-formats/template_free.html#sec-background",
-    "title": "Template-Free",
-    "section": "",
-    "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset formats\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters &lt;|im_start|&gt; that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n  - path: output.jsonl\n    type: input_output  # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.",
-    "crumbs": [
-      "Dataset Formats",
-      "Template-Free"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/template_free.html#sec-usage",
-    "href": "docs/dataset-formats/template_free.html#sec-usage",
-    "title": "Template-Free",
-    "section": "Usage",
-    "text": "Usage\nThis is how you can use the input_output format:\n\n1. Prepare Data\nTo use the input_output format, collect your data in the following\nformat into a jsonl file (below is the first row from the file\noutput.jsonl` pretty printed):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. \"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}\n\nSet label:false when you want to mask a segment of text so that the\nmodel isn’t trained on it. Some things to keep in mind:\n\n[!IMPORTANT]\n1. EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl\nconcatenates all the segments as-is. The tokenizer doesn’t add\nanything additional. Notice how I added spaces, newlines, &lt;s&gt;\n(BOS), and &lt;/s&gt; (EOS) myself.\n2. Make sure you check the materialized output to validate that the\nprompt is getting assembled how you like.\n\n\n\n2. Use type: input_output\nLet’s materialize data with our output.jsonl file by setting\ntype: input_output in our axolotl config:\n# training_config.yaml\nbase_model: mistralai/Mistral-7B-v0.1\ndata_seed: 49\nseed: 49\n\ndatasets:\n  - path: output.jsonl\n    type: input_output\nval_set_size: 0.1\n\nsequence_len: 896\nsample_packing: false\n\nmicro_batch_size: 2\ngradient_accumulation_steps: 3\neval_batch_size: 2\nnum_epochs: 1\nlearning_rate: 0.0002\n\ntrain_on_inputs: false\nspecial_tokens:\n  bos_token: \"&lt;s&gt;\"\n  eos_token: \"&lt;/s&gt;\"\n  unk_token: \"&lt;unk&gt;\"\nYou can use the following command to materialize your data. 
The\n--debug flag will print the tokens, along with the labels so you can\nverify that the correct items are being ignored:\naxolotl preprocess training_config.yaml --debug\n\n...\n[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] &lt;s&gt;(1, 1) Hello(22557, 22557)\n(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) &lt;/s&gt;(2, 2)\nThe format is decoded_token(label, token_id), for example,\n&lt;s&gt;(1, 1) means that the token is &lt;s&gt;, the label is 1 and the\ntoken_id is 1. When the label is -100 then that token is ignored for\ntraining.\n\n\n3. Check the prompts\nHere is another way to check the materialized output:\nfrom transformers import AutoTokenizer\nfrom datasets import load_from_disk\nimport yaml\n\ndirectory = !ls last_run_prepared/\nwith open('training_config.yaml', 'r') as f:\n    cfg = yaml.safe_load(f)\nmodel_id = cfg['base_model']\ntok = AutoTokenizer.from_pretrained(model_id)\nds = load_from_disk(f'last_run_prepared/{directory[0]}/')\n&gt;&gt;&gt; row = ds[0]\n&gt;&gt;&gt; print(tok.decode(row['input_ids']))\n&lt;s&gt; Hello\n    hi there!.  goodbye  farewell&lt;/s&gt;\nWe can check that the right tokens are ignored by comparing the labels\nto each token:\nimport pandas as pd\npd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in\n              zip(row['input_ids'], row['labels'])])\n\n\n\ntoken\nlabel\nid\n\n\n\n\n0\n&lt;s&gt;\n1\n\n\n1\nHello\n22557\n\n\n2\n\\n\n13\n\n\n3\nhi\n12014\n\n\n4\nthere\n736\n\n\n5\n!\n28808\n\n\n6\n.\n28723\n\n\n7\n\n28705\n\n\n8\ngood\n-100\n\n\n9\nbye\n-100\n\n\n10\n\n-100\n\n\n11\nfare\n19111\n\n\n12\nwell\n5458\n\n\n13\n&lt;/s&gt;\n2\n\n\n\nIf we look at the input data, the above table seems correct! 
(The jsonl\nversion is repeated below for reference):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. \"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}",
-    "crumbs": [
-      "Dataset Formats",
-      "Template-Free"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/index.html",
-    "href": "docs/dataset-formats/index.html",
-    "title": "Dataset Formats",
-    "section": "",
-    "text": "Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.\nAs there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.\nAxolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.",
-    "crumbs": [
-      "Dataset Formats"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/index.html#pre-training",
-    "href": "docs/dataset-formats/index.html#pre-training",
-    "title": "Dataset Formats",
-    "section": "Pre-training",
-    "text": "Pre-training\nWhen aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports streaming to only load batches into memory at a time.\nA sample format for a pre-training dataset is as follows:\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\nIt is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.\nAxolotl supports loading from a Hugging Face hub repo or from local files.\n\n\n\n\n\n\nImportant\n\n\n\nFor pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.\n\n\n\nPre-training from Hugging Face hub datasets\nAs an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:\npretraining_dataset: hf_org/name\n\n\nPre-training from local dataset files\nGiven a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:\npretraining_dataset:\n  - path: json\n    data_files:\n      - A.jsonl\n      - B.jsonl\n      - C.jsonl\nWhile we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset\n\n\nPre-training without streaming\nOn the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. 
This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.\nOne benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.\nFrom Hugging Face:\ndatasets:\n  - path: hf_org/name\n    type: completion\nFrom local files (either example works):\ndatasets:\n  - path: A.jsonl\n    type: completion\n\n  - path: json\n    data_files: [\"A.jsonl\", \"B.jsonl\", \"C.jsonl\"]\n    type: completion\n\n\nPre-training dataset configuration tips\n\nSetting max_steps\nWhen using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.\nTherefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.\nOne step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.\n\n\nGroup_by_length\nIt is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.\n\n\n\nReference\nPlease see docs here.",
-    "crumbs": [
-      "Dataset Formats"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/index.html#supervised-fine-tuning-sft",
-    "href": "docs/dataset-formats/index.html#supervised-fine-tuning-sft",
-    "title": "Dataset Formats",
-    "section": "Supervised fine-tuning (SFT)",
-    "text": "Supervised fine-tuning (SFT)\nSupervised fine-tuning is the process of training models to respond to an instruction or chat input.\nAs there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.\nAxolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.\nA flow chart is as follows:\n\nDo you already have the dataset tokenized? If yes, check Pre-Tokenized Dataset.\nDo you want to format the dataset yourself and manually choose each section to mask? If yes, check Template Free Dataset\nIs your dataset in a “conversation” format, containing a list[messages]? If yes, check Conversation Dataset\nIs your dataset in an “instruct” format, containing { instruction, response }? If yes, check Instruction Dataset\n\nIf you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.\n\n\n\n\n\n\nTip\n\n\n\nYou can mix and match within each approach or across approaches to train a model on a variety of datasets.\n\n\n\nPre-Tokenized Dataset\nWe suggest this approach when you want to bring your own tokenized dataset.\nAxolotl expects the dataset to have three keys:\n\ninput_ids: from tokenizing formatted prompt\nattention_mask: for masking padding. 
If you don’t add padding, it would be equal to len(input_ids) * [1]\nlabels: this is the same as input_ids, however, if you want to mask certain tokens, you would set those indices to -100.\n\n\n\n\n\n\n\nTip\n\n\n\nMake sure to add BOS/EOS tokens to your prompt and mask it appropriately.\n\n\nA config for this would look like:\ndatasets:\n  - path: A.jsonl\n    type:\n\n\n\n\n\n\nNote\n\n\n\ntype: is empty!\n\n\nReference: Pre-Tokenized Dataset Documentation.\n\n\nTemplate Free Dataset\nWe reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.\nIn the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. 
\"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}\nEach prompt must be have a key called segments which is a list of { text, label }.\ndatasets:\n  - path: A.jsonl\n    type: input_output\nReference: Template Free Documentation.\n\n\nConversation Dataset\nconversation messages are a list of messages which usually contain a role and content key.\n\n\n\n\n\n\nTip\n\n\n\nFun fact: Axolotl synonymously refers to “chat” messages as conversation messages due to how FastChat initially used this term to build a widely used fastchat conversation method for formatting chat messages prior to the creation of chat_templates.\n\n\n\nWhat are chat_templates?\nThe current most popular and convenient method for inference is to use chat_templates for formatting prompts. Axolotl supports using chat_templates for training to ensure that the model performs in the same environment as in inference.\nHere’s a quick rundown on chat_template: A chat_template is a Jinja2 template which formats a list of messages into a prompt.\nAn example of a prompt formatted into a popular template called ChatML can be seen below:\nSingle prompt (pretty-printed):\n{\n    \"messages\": [\n        {\n            \"role\": \"user\",\n            \"content\": \"Hi\"\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": \"How can I help you?\"\n        },\n        {\n            \"role\": \"user\",\n            \"content\": \"Can you add 3+5?\"\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": \"The answer is 8.\"\n        }\n    ]\n}\nThe ChatML template is as follows:\n{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'&lt;|im_start|&gt;' + message['role'] + '\\n' + message['content'] + '&lt;|im_end|&gt;' 
+ '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '&lt;|im_start|&gt;assistant\\n' }}{% endif %}\nThe above prompt formatted into this template will result in:\n&lt;|im_start|&gt;user\nHi&lt;|im_end|&gt;\n&lt;|im_start|&gt;assistant\nHow can I help you?&lt;|im_end|&gt;\n&lt;|im_start|&gt;user\nCan you add 3+5?&lt;|im_end|&gt;\n&lt;|im_start|&gt;assistant\nThe answer is 8.&lt;|im_end|&gt;\nBy using delimiters (&lt;|im_start|&gt; and &lt;|im_end|&gt;), a prompt separates different speakers which helps the model identify which portion belongs to whom.\n\n\nCommon Conversation Dataset formats\nOlder conversation datasets with the following format are colloquially called sharegpt datasets.\n{\"conversations\": [{\"from\": \"...\", \"value\": \"...\"}]}\nNewer conversation datasets usually follow the OpenAI format.\n{\"messages\": [{\"role\": \"...\", \"content\": \"...\"}]}\nAxolotl supports both as well as allowing customization of any kind of key.\n\n\nChat Template Usage\nTo properly use this method, it is important to identify three things:\n\nWhich chat_template would you use?\nWhat are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be messages, role, and content, respectively, whereas the possible roles are system, user, and assistant.\nWhat do you want to mask? For instance, only assistant messages, only last message, or nothing.\n\n\nChoosing a chat_template\nThere are a lot of chat_templates out there. Axolotl supports the common ones: supported chat templates. For example, to use ChatML, it would be chat_template: chatml.\nHowever, it is also possible to use the already configured template within the tokenizer by specifying chat_template: tokenizer_default. 
If you want a fallback (in case some tokenizer does not have it pre-configured), you can do chat_template: tokenizer_default_fallback_chatml to fallback to the ChatML template if a tokenizer template was not found.\nOne last but powerful approach is to bring your own template. This can be set via:\nchat_template_jinja: # your template\n\n\nSetting chat_template dataset keys\nWe currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.\nIf your dataset format is different, here are the keys you should check (with their defaults):\ndatasets:\n    ...\n    field_messages: messages  # this should point to the key containing the list of conversations\n    message_property_mappings:  # this is a mapping from keys in your dataset to keys in chat_template\n      role: role\n      content: content\nIn some chat_templates (e.g. Gemma), the roles are hardcoded to user and assistant. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a KeyError, it would be necessary to add mapping for your roles. Here is an example of how it would look like:\ndatasets:\n    ...\n    roles:\n      assistant:\n        - gpt\n        - model\n      user:\n        - human\nIn the example above, all gpt and model values are converted to assistant. All human values are converted to user.\n\n\nHandling masking\nThe common use case for chat_template is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.\nTo train on all assistant messages, you would set the following configs.\ndatasets:\n    ...\n    roles_to_train: [\"assistant\"]\n    train_on_eos: \"turn\"\nThe train_on_eos config means that it would mask all EOS tokens for turns that aren’t assistant-turns. 
The other options are: all and last to choose which EOS to train on.\nPerhaps, you want to train on assistant and narrator roles, you can simply add narrator to the list of roles_to_train. You would also need to add it to the mapping of roles above.\ndatasets:\n    ...\n    roles_to_train: [\"assistant\", \"narrator\"]\n    roles:\n      assistant:\n        - gpt\n        - model\n      user:\n        - human\n      narrator: [\"narrator\"]\n\n\n\n\n\n\nTip\n\n\n\nAs chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, ChatML uses &lt;|im_end|&gt; to end turns.\nspecial_tokens:\n  eos_token: &lt;|im_end|&gt;\n\n\n\n\nApplying chat_template\nOnce all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.\ndatasets:\n  - path: A.jsonl\n    type: chat_template\n\n    # step 1\n    chat_template: chatml\n\n    # step 2\n    field_messages: messages\n    message_property_mappings:\n      role: role\n      content: content\n\n    roles:\n      assistant:\n        - gpt\n        - model\n        - assistant\n      user:\n        - human\n        - user\n\n    # step 3\n    roles_to_train: [\"assistant\"]\n    train_on_eos: \"turn\"\n\nspecial_tokens:\n  eos_token: &lt;|im_end|&gt;\nIf this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via axolotl preprocess config.yaml --debug):\n&lt;|im_start|&gt;(-100, 128256) user(-100, 882)\n(-100, 198) Hi(-100, 13347) &lt;|im_end|&gt;(-100, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) assistant(-100, 78191)\n(-100, 198) How(4438, 4438)  can(649, 649)  I(358, 358)  help(1520, 1520)  you(499, 499) ?(30, 30) &lt;|im_end|&gt;(128257, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) user(-100, 882)\n(-100, 198) Can(-100, 6854)  you(-100, 499)  add(-100, 923)  (-100, 220) 3(-100, 18) +(-100, 10) 
5(-100, 20) ?(-100, 30) &lt;|im_end|&gt;(-100, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) assistant(-100, 78191)\n(-100, 198) The(791, 791)  answer(4320, 4320)  is(374, 374)  (220, 220) 8(23, 23) .(13, 13) &lt;|im_end|&gt;(128257, 128257)\n(-100, 198)\nThe first number refers to the label, the second refers to the token_id. For example, -100 labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the token_id.\n\n\n\n\n\n\nNote\n\n\n\nIf during preprocess, there are a lot of warnings of Could not find content __ boundary, please check the FAQ section for chat_templates.\n\n\n\n\n\nReference\nPlease see docs here.\n\n\n\nInstruction Dataset\nInstruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.\nAn example is of a common format called Alpaca:\n{\"instruction\": \"...\", \"input\": \"...\", \"output\": \"...\"}\nUsing those keys, a prompt can be built based on it.\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}\nThis can be configured as such:\ndatasets:\n  - path: A.jsonl\n    type: alpaca\nAxolotl supports many kinds of instruction dataset. 
All of them can be found in the Instruction Dataset Documentation with their respective type and sample row format.\n\nCustom Instruct Prompt Format\nDue to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.\nIn the example below, a sample row is used to output in mistral_v1 format.\n{\"input\": \"...\", \"output\": \"...\"}\ndatasets:\n  - path: repo\n    type:\n      system_prompt: \"\"\n\n      field_system:\n      field_instruction: input\n      field_input:\n      field_output: output\n\n      # multi-line example with input\n      format: |-\n        [INST] {instruction} {input} [/INST]\n\n      # single-line example without input\n      no_input_format: \"[INST] {instruction} [/INST]\"\nThe config sets that the field_instruction is actually named input, and the field_input is empty as we don’t have an input in this sample. Generally, instruction can be thought as the question to the model, and input as the additional information with output being the response. It is not necessary to have an input nor system. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.\nReference: Custom Instruct Prompt Format Documentation.",
-    "crumbs": [
-      "Dataset Formats"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/index.html#reinforcement-learning-from-human-feedback-rlhf",
-    "href": "docs/dataset-formats/index.html#reinforcement-learning-from-human-feedback-rlhf",
-    "title": "Dataset Formats",
-    "section": "Reinforcement Learning from Human Feedback (RLHF)",
-    "text": "Reinforcement Learning from Human Feedback (RLHF)\nAs there are multiple RLHF methods with their own dataset requirements. Please see RLHF documentation for more detail.",
-    "crumbs": [
-      "Dataset Formats"
-    ]
-  },
-  {
-    "objectID": "docs/dataset-formats/tokenized.html",
-    "href": "docs/dataset-formats/tokenized.html",
-    "title": "Custom Pre-Tokenized Dataset",
-    "section": "",
-    "text": "Pass an empty type: in your axolotl config.\nColumns in Dataset must be exactly input_ids, attention_mask, labels\nTo indicate that a token should be ignored during training, set its corresponding label to -100.\nYou must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.\nFor pretraining, do not truncate/pad documents to the context window length.\nFor instruction training, documents must be truncated/padded as desired.\n\nSample config:\n\n\nconfig.yml\n\ndatasets:\n  - path: /path/to/your/file.jsonl\n    ds_type: json\n    type:\n\nSample jsonl:\n{\"input_ids\":[271,299,99],\"attention_mask\":[1,1,1],\"labels\":[271,-100,99]}\n{\"input_ids\":[87,227,8383,12],\"attention_mask\":[1,1,1,1],\"labels\":[87,227,8383,12]}",
-    "crumbs": [
-      "Dataset Formats",
-      "Custom Pre-Tokenized Dataset"
-    ]
-  },
-  {
-    "objectID": "docs/nccl.html",
-    "href": "docs/nccl.html",
-    "title": "NCCL",
-    "section": "",
-    "text": "NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several environment variables. A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:\nWatchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.\nOften, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends disabling PCI access control services (ACS) as a possible solution if this is available to you.\nForcing cross-GPU communication via NVLink may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command:\nnvidia-smi nvlink --status\nTo force NCCL to use NVLink, simply set this in the environment:\nexport NCCL_P2P_LEVEL=NVL\nIf NVLink is not available in your environment there are other options for NCCL_P2P_LEVEL in the table below:\n\n\n\n\n\n\n\nNCCL_P2P_LEVEL\nDescription\n\n\n\n\nPIX\nP2P data transfers through no more than a single PCIe bridge. 
Faster data transfer rates vs to paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication.\n\n\nPXB\nP2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency.\n\n\nPHB\nP2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (ex PIX, NVL)\n\n\n\nTo validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:\n./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3\nIt can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:\nexport NCCL_DEBUG=INFO\nexport NCCL_DEBUG_SUBSYS=ALL\nexport TORCH_DISTRIBUTED_DEBUG=INFO\nexport TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log\nFinally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ddp_timeout value in the Axolotl configuration. See PyTorch init_process_group for documentation on this value.",
+    "text": "Footnotes\n\n\nThe config actually mimics the command CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml, but this is the same thing.↩︎\nMany of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags here.↩︎",
     "crumbs": [
       "Troubleshooting",
-      "NCCL"
+      "Debugging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html",
-    "href": "docs/getting-started.html",
-    "title": "Quickstart",
+    "objectID": "docs/inference.html",
+    "href": "docs/inference.html",
+    "title": "Inference and Merging",
     "section": "",
-    "text": "This guide will walk you through your first model fine-tuning project with Axolotl.",
+    "text": "This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-quick-example",
-    "href": "docs/getting-started.html#sec-quick-example",
-    "title": "Quickstart",
-    "section": "1 Quick Example",
-    "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.",
+    "objectID": "docs/inference.html#sec-quickstart",
+    "href": "docs/inference.html#sec-quickstart",
+    "title": "Inference and Merging",
+    "section": "1 Quick Start",
+    "text": "1 Quick Start\n\n\n\n\n\n\nTip\n\n\n\nUse the same config used for training on inference/merging.\n\n\n\n1.1 Basic Inference\n\nLoRA ModelsFull Fine-tuned Models\n\n\naxolotl inference your_config.yml --lora-model-dir=\"./lora-output-dir\"\n\n\naxolotl inference your_config.yml --base-model=\"./completed-model\"",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-understanding",
-    "href": "docs/getting-started.html#sec-understanding",
-    "title": "Quickstart",
-    "section": "2 Understanding the Process",
-    "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n  - path: teknium/GPT4-LLM-Cleaned\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enables LoRA adapter finetuning.\n\nTo perform Full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our Config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and / or LoRA weights",
+    "objectID": "docs/inference.html#sec-advanced",
+    "href": "docs/inference.html#sec-advanced",
+    "title": "Inference and Merging",
+    "section": "2 Advanced Usage",
+    "text": "2 Advanced Usage\n\n2.1 Gradio Interface\nLaunch an interactive web interface:\naxolotl inference your_config.yml --gradio\n\n\n2.2 File-based Prompts\nProcess prompts from a text file:\ncat /tmp/prompt.txt | axolotl inference your_config.yml \\\n  --base-model=\"./completed-model\" --prompter=None\n\n\n2.3 Memory Optimization\nFor large models or limited memory:\naxolotl inference your_config.yml --load-in-8bit=True",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-custom",
-    "href": "docs/getting-started.html#sec-custom",
-    "title": "Quickstart",
-    "section": "3 Your First Custom Training",
-    "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n  - path: my_data.jsonl        # Your local data file\n    type: alpaca               # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n    \"instruction\": \"Write a description of alpacas.\",\n    \"input\": \"\",\n    \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected `alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml",
+    "objectID": "docs/inference.html#sec-merging",
+    "href": "docs/inference.html#sec-merging",
+    "title": "Inference and Merging",
+    "section": "3 Merging LoRA Weights",
+    "text": "3 Merging LoRA Weights\nMerge LoRA adapters with the base model:\naxolotl merge-lora your_config.yml --lora-model-dir=\"./completed-model\"\n\n3.1 Memory Management for Merging\n\nConfiguration OptionsForce CPU Merging\n\n\ngpu_memory_limit: 20GiB  # Adjust based on your GPU\nlora_on_cpu: true        # Process on CPU if needed\n\n\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora ...",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-common-tasks",
-    "href": "docs/getting-started.html#sec-common-tasks",
-    "title": "Quickstart",
-    "section": "4 Common Tasks",
-    "text": "4 Common Tasks\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\n\n\n4.2 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\n\n\n4.3 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio",
+    "objectID": "docs/inference.html#sec-tokenization",
+    "href": "docs/inference.html#sec-tokenization",
+    "title": "Inference and Merging",
+    "section": "4 Tokenization",
+    "text": "4 Tokenization\n\n4.1 Common Issues\n\n\n\n\n\n\nWarning\n\n\n\nTokenization mismatches between training and inference are a common source of problems.\n\n\nTo debug:\n\nCheck training tokenization:\n\naxolotl preprocess your_config.yml --debug\n\nVerify inference tokenization by decoding tokens before model input\nCompare token IDs between training and inference\n\n\n\n4.2 Special Tokens\nConfigure special tokens in your YAML:\nspecial_tokens:\n  bos_token: \"&lt;s&gt;\"\n  eos_token: \"&lt;/s&gt;\"\n  unk_token: \"&lt;unk&gt;\"\ntokens:\n  - \"&lt;|im_start|&gt;\"\n  - \"&lt;|im_end|&gt;\"",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "docs/getting-started.html#sec-next-steps",
-    "href": "docs/getting-started.html#sec-next-steps",
-    "title": "Quickstart",
-    "section": "5 Next Steps",
-    "text": "5 Next Steps\nNow that you have the basics, you might want to:\n\nTry different model architectures\nExperiment with hyperparameters\nUse more advanced training methods\nScale up to larger models\n\nCheck our other guides for details on these topics:\n\nConfiguration Guide - Full configuration options\nDataset Formats - Working with different data formats\nMulti-GPU Training\nMulti-Node Training",
+    "objectID": "docs/inference.html#sec-troubleshooting",
+    "href": "docs/inference.html#sec-troubleshooting",
+    "title": "Inference and Merging",
+    "section": "5 Troubleshooting",
+    "text": "5 Troubleshooting\n\n5.1 Common Problems\n\nMemory IssuesToken IssuesPerformance Issues\n\n\n\nUse 8-bit loading\nReduce batch sizes\nTry CPU offloading\n\n\n\n\nVerify special tokens\nCheck tokenizer settings\nCompare training and inference preprocessing\n\n\n\n\nVerify model loading\nCheck prompt formatting\nEnsure temperature/sampling settings\n\n\n\n\nFor more details, see our debugging guide.",
     "crumbs": [
       "Getting Started",
-      "Quickstart"
+      "Inference and Merging"
     ]
   },
   {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html",
-    "title": "Setting up",
+    "objectID": "docs/dataset-formats/conversation.html",
+    "href": "docs/dataset-formats/conversation.html",
+    "title": "Conversation",
     "section": "",
-    "text": "import torch\n# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\nassert (torch.cuda.is_available()==True)\n!pip install --no-build-isolation axolotl[deepspeed]"
+    "text": "Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer’s template, a supported template, or custom jinja2.\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"content\": \"...\"}]}\n\nSee configs for full configs and supported templates.\n\n\nMost configs can be adapted as follows:\n# old\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: sharegpt\n    conversation: chatml\n\n# new (if using tokenizer's chat_template)\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n\n# new (if setting a new chat_template like chatml, gemma, etc)\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\nWe recommend checking the below examples for other usecases.\n\n\n\n\n(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train:\n    train_on_eos:\n\n\n\n\n\n\nTip\n\n\n\nIf you receive an error like “chat_template choice is tokenizer_default but tokenizer’s chat_template is null.”, it means the tokenizer does not have a default chat_template. 
Follow the examples below instead to set a custom chat_template.\n\n\n\nUsing the gemma chat template to override the tokenizer_config.json’s chat template on OpenAI messages format, training on all assistant messages.\n\nchat_template: gemma # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train: [\"assistant\"]  # default value\n\nUsing the tokenizer_config.json’s chat template or chatml as fallback if the former’s chat template does not exist, on OpenAI messages format, training on all assistant messages.\n\nchat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n\nUsing a custom jinja template on OpenAI messages format, training on all assistant messages.\n\n# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty\nchat_template_jinja: \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'&lt;|system|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n'}}{% elif (message['role'] == 'user') %}{{'&lt;|user|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n' + '&lt;|assistant|&gt;' + '\\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '&lt;|end|&gt;' + '\\n'}}{% endif %}{% endfor %}\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure that your tokenizer.eos_token is same as EOS (End-of-Sequence) token in template. Otherwise, set eos_token under special_tokens:.\n\n\n\nIf you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the eot_tokens: config. 
The handling of EOT tokens follows train_on_eos: which defaults to turn.\n\neot_tokens:\n  - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    # optional\n    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)\n\n\n\n\n\n\nTip\n\n\n\nSee config documentation for detailed explanations of “turn”, “last”, and “all” options for training on tokens.\n\n\n\n\n\n\n\n\nNote\n\n\n\nUsing eot_tokens requires each token that exists in chat_template to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.\nYou can add those tokens as new tokens under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:. See config for more details.\n\n\n\nContinuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set train_on_eos: last.\n\neot_tokens:\n  - \"[/INST]\"\n  # ...\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    train_on_eos: last\n    train_on_eot: turn\n\n\n\n\n\n\nTip\n\n\n\nIf EOS token only appears at the end of a prompt, train_on_eos: last is equivalent to train_on_eos: turn. 
Therefore, generally, you can leave them to their defaults and omit them.\n\n\n\n(Advanced) Using fine-grained control over tokens and turns to train in a conversation\n\nFor a data sample that looks like:\n\n\ndata.jsonl\n\n{\n  \"conversations\": [\n    {\"from\": \"system\", \"value\": \"You are an AI assistant.\", \"train\": false},\n    {\"from\": \"human\", \"value\": \"Hello\", \"train\": false},\n    {\"from\": \"assistant\", \"value\": \"Hello\", \"train\": true},\n    {\"from\": \"human\", \"value\": \"How are you?\", \"train\": true},\n    {\n      \"from\": \"assistant\",\n      \"value\": \"I'm doing very well, thank you!\",\n      \"train_detail\": [\n        {\"begin_offset\": 0, \"end_offset\": 8, \"train\": false},\n        {\"begin_offset\": 9, \"end_offset\": 18, \"train\": true},\n        {\"begin_offset\": 19, \"end_offset\": 30, \"train\": false},\n      ],\n    },\n    {\n        \"from\": \"human\",\n        \"value\": \"I'm doing very well, thank you!\",\n        \"train\": true,\n    },\n    {\"from\": \"assistant\", \"value\": \"Hi there!\", \"train\": true}\n  ]\n}\n\nThe configuration would look like:\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: tokenizer_default\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n    roles_to_train: []\n    train_on_eos: turn\n    message_field_training: train\n    message_field_training_detail: train_detail\n\n\n\n\n\n\nTip\n\n\n\nIt is not necessary to set both message_field_training and message_field_training_detail at once.\n\n\n\n(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: qwen3\n    split_thinking: true\nFor example, a content can look like:\n{\n  \"content\": \"&lt;think&gt;Some thinking outputs&lt;/think&gt;Output after thinking.\"\n}\nAfter 
split, it will look like:\n{\n  \"reasoning_content\": \"Some thinking outputs\",\n  \"content\": \"Output after thinking...\"\n}",
+    "crumbs": [
+      "Dataset Formats",
+      "Conversation"
+    ]
   },
   {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
-    "title": "Setting up",
-    "section": "Hugging Face login (optional)",
-    "text": "Hugging Face login (optional)\n\nfrom huggingface_hub import notebook_login\nnotebook_login()"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
-    "title": "Setting up",
-    "section": "Example configuration",
-    "text": "Example configuration\n\nimport yaml\n\nyaml_string = \"\"\"\nbase_model: NousResearch/Meta-Llama-3.1-8B\n\nload_in_8bit: false\nload_in_4bit: true\nstrict: false\n\ndatasets:\n  - path: tatsu-lab/alpaca\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.05\noutput_dir: ./outputs/lora-out\n\nsequence_len: 2048\nsample_packing: true\neval_sample_packing: true\npad_to_sequence_len: true\n\nadapter: qlora\nlora_model_dir:\nlora_r: 32\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_linear: true\nlora_fan_in_fan_out:\nlora_modules_to_save:\n  - embed_tokens\n  - lm_head\n\nwandb_project:\nwandb_entity:\nwandb_watch:\nwandb_name:\nwandb_log_model:\n\ngradient_accumulation_steps: 2\nmicro_batch_size: 1\nnum_epochs: 1\noptimizer: paged_adamw_8bit\nlr_scheduler: cosine\nlearning_rate: 2e-5\n\ntrain_on_inputs: false\ngroup_by_length: false\nbf16: auto\nfp16:\ntf32: false\n\ngradient_checkpointing: true\nearly_stopping_patience:\nresume_from_checkpoint:\nlogging_steps: 1\nxformers_attention:\nflash_attention: false\nsdp_attention: true\n\nwarmup_steps: 1\nmax_steps: 25\nevals_per_epoch: 1\neval_table_size:\nsaves_per_epoch: 1\ndebug:\ndeepspeed:\nweight_decay: 0.0\nfsdp:\nfsdp_config:\nspecial_tokens:\n  pad_token: &lt;|end_of_text|&gt;\n\"\"\"\n\n\n# Convert the YAML string to a Python dictionary\nyaml_dict = yaml.safe_load(yaml_string)\n\n# Specify your file path\nfile_path = 'test_axolotl.yaml'\n\n# Write the YAML file\nwith open(file_path, 'w') as file:\n    yaml.dump(yaml_dict, file)\n\nAbove we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\nThe Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. 
Let’s go through them line by line:\n\n“base model”: String value, specifies the underlying pre-trained LLM that will be used for finetuning\n\nNext we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n\n“load_in_8bit”: Boolean value, whether to quantize the model weights into 8-bit integer.\n“load_in_4bit”: Boolean value, whether to quantize the model weights into 4-bit integer.\n“strict”: Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n“datasets”: a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n“val_set_size”: Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n“output_dir”: String value. Path of trained model.\n\nFor data preprocessing:\n\n“sequence_len”: Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n“pad_to_sequence_len”: Boolean. Padding input to maximum sequence length.\n“sample_packing”: Boolean. Specifies whether to use multi-packing with block diagonal attention.\n“special_tokens”: Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n\nFor LoRA configuration and its hyperparamters:\n\n“adapter”: String. Either “lora” or “qlora”, depending on user’s choice.\n“lora_model_dir”: String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n“lora_r”: Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n“lora_alpha”: Integer. 
Scale the weight matrices by \\(\\frac{\\text{lora_alpha}}{\\text{lora_r}}\\)Recommended to be fixed at 16.\n“lora_dropout”: Float that is 1 or less. The dropout probability of a lora layer.\n“lora_target_linear”: Boolean. If true, lora will target all linear modules in the transformers architecture.\n“lora_modules_to_save”: If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n\nSee LoRA for detailed explanation of LoRA implementation.\nFor the training configurations:\n\n“gradient_accumulation_steps”: Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n“micro_batch_size”: Integer. Batch size per gpu / gradient_accumulation_steps\n“num_epochs”: Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n“optimizer”: The optimizer to use for the training.\n“learning_rate”: The learning rate.\n“lr_scheduler”: The learning rate scheduler to use for adjusting learning rate during training.\n“train_on_inputs”: Boolean. Whether to ignore or include the user’s prompt from the training labels.\n“group_by_length”: Boolean. Whether to group similarly sized data to minimize padding.\n“bf16”: Either “auto”, “true”, or “false”. Whether to use CUDA bf16 floating point format. If set to “auto”, will automatically apply bf16 should the gpu supports it.\n“fp16”: Optional. Specifies whether to use CUDA fp16. Automatically set to true if “bf16” is set to true. Otherwise false.\n“tf32”: Boolean. Whether to use CUDA tf32. Will override bf16.\n“gradient_checkpointing”: Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n“gradient_checkpointing_kwargs”: Python Dict. Fed into the trainer.\n“logging_steps”: Integer. Log training information over every specified number of steps.\n“flash_attention”: Boolean. 
Whether to use the flash attention mechanism.\n“sdp_attention”: Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the original implementation of transformers.)\n“warmup_steps”: Integer. The number of pre-training steps where a very low learning rate is used.\n“evals_per_epoch”: Integer. Number of evaluations to be performed within one training epoch.\n“saves_per_epoch”: Integer. Number of times the model is saved in one training epoch.\n“weight_decay”: Positive Float. Sets the “strength” of weight decay (i.e. setting the coefficient of L2 regularization)\n\nThe above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see here\nTrain the model\n\n!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml\n\nPredict with trained model\n\n!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
-    "title": "Setting up",
-    "section": "Deeper Dive",
-    "text": "Deeper Dive\nIt is also helpful to gain some familiarity over some of the core inner workings of axolotl"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
-    "title": "Setting up",
-    "section": "Configuration Normalization",
-    "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault\nto store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
-    "title": "Setting up",
-    "section": "Loading Models, Tokenizers, and Trainer",
-    "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py.\ntrainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )"
-  },
-  {
-    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
-    "href": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
-    "title": "Setting up",
-    "section": "Monkey patch",
-    "text": "Monkey patch\nThe Monkey patch directory is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
-  },
-  {
-    "objectID": "docs/rlhf.html",
-    "href": "docs/rlhf.html",
-    "title": "RLHF (Beta)",
+    "objectID": "docs/dataset-formats/conversation.html#chat_template",
+    "href": "docs/dataset-formats/conversation.html#chat_template",
+    "title": "Conversation",
     "section": "",
-    "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)",
+    "text": "Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer’s template, a supported template, or custom jinja2.\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"content\": \"...\"}]}\n\nSee configs for full configs and supported templates.\n\n\nMost configs can be adapted as follows:\n# old\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: sharegpt\n    conversation: chatml\n\n# new (if using tokenizer's chat_template)\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n\n# new (if setting a new chat_template like chatml, gemma, etc)\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\nWe recommend checking the below examples for other usecases.\n\n\n\n\n(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train:\n    train_on_eos:\n\n\n\n\n\n\nTip\n\n\n\nIf you receive an error like “chat_template choice is tokenizer_default but tokenizer’s chat_template is null.”, it means the tokenizer does not have a default chat_template. 
Follow the examples below instead to set a custom chat_template.\n\n\n\nUsing the gemma chat template to override the tokenizer_config.json’s chat template on OpenAI messages format, training on all assistant messages.\n\nchat_template: gemma # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train: [\"assistant\"]  # default value\n\nUsing the tokenizer_config.json’s chat template or chatml as fallback if the former’s chat template does not exist, on OpenAI messages format, training on all assistant messages.\n\nchat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n\nUsing a custom jinja template on OpenAI messages format, training on all assistant messages.\n\n# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty\nchat_template_jinja: \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'&lt;|system|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n'}}{% elif (message['role'] == 'user') %}{{'&lt;|user|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n' + '&lt;|assistant|&gt;' + '\\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '&lt;|end|&gt;' + '\\n'}}{% endif %}{% endfor %}\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure that your tokenizer.eos_token is same as EOS (End-of-Sequence) token in template. Otherwise, set eos_token under special_tokens:.\n\n\n\nIf you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the eot_tokens: config. 
The handling of EOT tokens follows train_on_eos: which defaults to turn.\n\neot_tokens:\n  - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    # optional\n    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)\n\n\n\n\n\n\nTip\n\n\n\nSee config documentation for detailed explanations of “turn”, “last”, and “all” options for training on tokens.\n\n\n\n\n\n\n\n\nNote\n\n\n\nUsing eot_tokens requires each token that exists in chat_template to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.\nYou can add those tokens as new tokens under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:. See config for more details.\n\n\n\nContinuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set train_on_eos: last.\n\neot_tokens:\n  - \"[/INST]\"\n  # ...\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    train_on_eos: last\n    train_on_eot: turn\n\n\n\n\n\n\nTip\n\n\n\nIf EOS token only appears at the end of a prompt, train_on_eos: last is equivalent to train_on_eos: turn. 
Therefore, generally, you can leave them to their defaults and omit them.\n\n\n\n(Advanced) Using fine-grained control over tokens and turns to train in a conversation\n\nFor a data sample that looks like:\n\n\ndata.jsonl\n\n{\n  \"conversations\": [\n    {\"from\": \"system\", \"value\": \"You are an AI assistant.\", \"train\": false},\n    {\"from\": \"human\", \"value\": \"Hello\", \"train\": false},\n    {\"from\": \"assistant\", \"value\": \"Hello\", \"train\": true},\n    {\"from\": \"human\", \"value\": \"How are you?\", \"train\": true},\n    {\n      \"from\": \"assistant\",\n      \"value\": \"I'm doing very well, thank you!\",\n      \"train_detail\": [\n        {\"begin_offset\": 0, \"end_offset\": 8, \"train\": false},\n        {\"begin_offset\": 9, \"end_offset\": 18, \"train\": true},\n        {\"begin_offset\": 19, \"end_offset\": 30, \"train\": false},\n      ],\n    },\n    {\n        \"from\": \"human\",\n        \"value\": \"I'm doing very well, thank you!\",\n        \"train\": true,\n    },\n    {\"from\": \"assistant\", \"value\": \"Hi there!\", \"train\": true}\n  ]\n}\n\nThe configuration would look like:\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: tokenizer_default\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n    roles_to_train: []\n    train_on_eos: turn\n    message_field_training: train\n    message_field_training_detail: train_detail\n\n\n\n\n\n\nTip\n\n\n\nIt is not necessary to set both message_field_training and message_field_training_detail at once.\n\n\n\n(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: qwen3\n    split_thinking: true\nFor example, a content can look like:\n{\n  \"content\": \"&lt;think&gt;Some thinking outputs&lt;/think&gt;Output after thinking.\"\n}\nAfter 
split, it will look like:\n{\n  \"reasoning_content\": \"Some thinking outputs\",\n  \"content\": \"Output after thinking...\"\n}",
     "crumbs": [
-      "How To Guides",
-      "RLHF (Beta)"
+      "Dataset Formats",
+      "Conversation"
     ]
   },
   {
-    "objectID": "docs/rlhf.html#overview",
-    "href": "docs/rlhf.html#overview",
-    "title": "RLHF (Beta)",
+    "objectID": "docs/dataset-formats/conversation.html#sharegpt",
+    "href": "docs/dataset-formats/conversation.html#sharegpt",
+    "title": "Conversation",
+    "section": "sharegpt",
+    "text": "sharegpt\n\n\n\n\n\n\nImportant\n\n\n\nShareGPT is deprecated! Please see the chat_template section.",
+    "crumbs": [
+      "Dataset Formats",
+      "Conversation"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/conversation.html#pygmalion",
+    "href": "docs/dataset-formats/conversation.html#pygmalion",
+    "title": "Conversation",
+    "section": "pygmalion",
+    "text": "pygmalion\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"value\": \"...\"}]}",
+    "crumbs": [
+      "Dataset Formats",
+      "Conversation"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/stepwise_supervised.html",
+    "href": "docs/dataset-formats/stepwise_supervised.html",
+    "title": "Stepwise Supervised Format",
     "section": "",
-    "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)",
+    "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n  \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n  \"completions\": [\n    \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n    \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n  ],\n  \"labels\": [true, false]\n}",
     "crumbs": [
-      "How To Guides",
-      "RLHF (Beta)"
+      "Dataset Formats",
+      "Stepwise Supervised Format"
     ]
   },
   {
-    "objectID": "docs/rlhf.html#rlhf-using-axolotl",
-    "href": "docs/rlhf.html#rlhf-using-axolotl",
-    "title": "RLHF (Beta)",
-    "section": "RLHF using Axolotl",
-    "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n  - path: Intel/orca_dpo_pairs\n    split: train\n    type: chatml.intel\n  - path: argilla/ultrafeedback-binarized-preferences\n    split: train\n    type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"chosen_response\": \"...\",\n    \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nchatml.icr\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": 
\"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"chosen_response\": \"...\",\n    \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.icr\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nzephyr.nectar\n{\n    \"prompt\": \"...\",\n    \"answers\": [\n        {\n            \"answer\": \"...\",\n            \"rank\": 1\n        },\n        {\n            \"answer\": \"...\",\n            \"rank\": 2\n        }\n        // ... 
more answers with ranks\n    ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n  - path: ...\n    split: train\n    type: chat_template.default\n    field_messages: \"messages\"\n    field_chosen: \"chosen\"\n    field_rejected: \"rejected\"\n    message_property_mappings:\n      role: role\n      content: content\n    roles:\n      user: [\"user\"]\n      assistant: [\"assistant\"]\n      system: [\"system\"]\nSample input format:\n{\n    \"messages\": [\n        {\n            \"role\": \"system\",\n            \"content\": \"...\"\n        },\n        {\n            \"role\": \"user\",\n            \"content\": \"...\"\n        },\n        // ... more messages\n    ],\n    \"chosen\": {\n        \"role\": \"assistant\",\n        \"content\": \"...\"\n    },\n    \"rejected\": {\n        \"role\": \"assistant\",\n        \"content\": \"...\"\n    }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n  - path: ...\n    split: train\n    type: user_defined.default\n\n    field_prompt: \"prompt\"\n    field_system: \"system\"\n    field_chosen: \"chosen\"\n    field_rejected: \"rejected\"\n    prompt_format: \"{prompt}\"\n    chosen_format: \"{chosen}\"\n    rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n  - path: argilla/ultrafeedback-binarized-preferences-cleaned\n    type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",  // if 
available, will be taken as user message for single-turn instead of from list below\n\n    // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1  # default\nkto_desirable_weight: 1.0  # default\nkto_undesirable_weight: 1.0  # default\n\nremove_unused_columns: false\n\ndatasets:\n  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n    type: llama3.ultra\n    split: train\n\ngradient_checkpointing: true\ngradient_checkpointing_kwargs:\n  use_reentrant: true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"}\n    ],\n    \"completion\": [\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nchatml.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n    \"completion\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"completion\": 
\"...\"\n}\n\n\nllama3.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n  - path: ...\n    split: train\n    type: user_defined.default\n\n    field_prompt: \"prompt\"\n    field_system: \"system\"\n    field_completion: \"completion\"\n    field_label: \"label\"\n    prompt_format: \"{prompt}\"\n    completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\",\n    \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n    host: 0.0.0.0\n    port: 8000\n    tensor_parallel_size: 2\n    gpu_memory_utilization: 0.85\n    dtype: auto\n    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n    use_vllm: true\n    vllm_server_host: 0.0.0.0\n    vllm_server_port: 8000\n    vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. 
In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -&gt; list[float]:\n    return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n    def transform_fn(example, tokenizer=None):\n        label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n        return {\n            \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n            \"answer\": label,\n        }\n    return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n    beta: 0.001\n    max_completion_length: 256\n    use_vllm: True\n    num_generations: 4\n    reward_funcs: [\"rewards.rand_reward_func\"]    # format: '{file_name}.{fn_name}'\n    reward_weights: [1.0]\ndatasets:\n  - path: openai/gsm8k\n    name: main\n    type: rewards.oai_gsm8k_transform  # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see description of the configs, please see TRLConfig.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1  # default in CPOTrainer\ncpo_alpha: 1.0  # default in CPOTrainer\nsimpo_gamma: 0.5  # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n  - ds_type: json\n    data_files:\n      - orca_rlhf.jsonl\n    split: train\n    type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports 
auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true",
-    "crumbs": [
-      "How To Guides",
-      "RLHF (Beta)"
-    ]
-  },
-  {
-    "objectID": "docs/mac.html",
-    "href": "docs/mac.html",
-    "title": "Mac M-series",
+    "objectID": "docs/dataset-formats/stepwise_supervised.html#stepwise-supervised",
+    "href": "docs/dataset-formats/stepwise_supervised.html#stepwise-supervised",
+    "title": "Stepwise Supervised Format",
     "section": "",
-    "text": "Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.\nCurrent support:\n\nSupport for all models\nFull training of models\nLoRA training\nSample packing\nFP16 and BF16 (awaiting AMP support for MPS in Pytorch)\nTri-dao’s flash-attn (until it is supported use spd_attention as an alternative)\nxformers\nbitsandbytes (meaning no 4/8 bits loading and bnb optimizers)\nqlora\nDeepSpeed\n\nUntested:\n\nFSDP",
+    "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n  \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n  \"completions\": [\n    \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n    \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n  ],\n  \"labels\": [true, false]\n}",
     "crumbs": [
-      "Deployments",
-      "Mac M-series"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html",
-    "href": "docs/custom_integrations.html",
-    "title": "Custom Integrations",
-    "section": "",
-    "text": "Axolotl adds custom features through integrations. They are located within the src/axolotl/integrations directory.\nTo enable them, please check the respective documentations.",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#cut-cross-entropy",
-    "href": "docs/custom_integrations.html#cut-cross-entropy",
-    "title": "Custom Integrations",
-    "section": "Cut Cross Entropy",
-    "text": "Cut Cross Entropy\nCut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.\nSee https://github.com/apple/ml-cross-entropy\n\nRequirements\n\nPyTorch 2.4.0 or higher\n\n\n\nInstallation\nRun the following command to install cut_cross_entropy[transformers] if you don’t have it already.\n\nIf you are in dev environment\n\npython scripts/cutcrossentropy_install.py | sh\n\nIf you are installing from pip\n\npip3 uninstall -y cut-cross-entropy && pip3 install \"cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438\"\n\n\nUsage\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n\nSupported Models\n\nllama\nllama4\nllama4_text\nmllama\nphi3\ngemma\ngemma2\ngemma3\ngemma3_text\nmistral\nmistral3\nqwen2\nqwen2_moe\nqwen2_vl\nqwen2_5_vl\nqwen3\nqwen3_moe\ncohere\ncohere2\nglm\nglm4\n\n\n\nCitation\n@article{wijmans2024cut,\n  author       = {Erik Wijmans and\n                  Brody Huval and\n                  Alexander Hertzberg and\n                  Vladlen Koltun and\n                  Philipp Kr\\\"ahenb\\\"uhl},\n  title        = {Cut Your Losses in Large-Vocabulary Language Models},\n  journal      = {arXiv},\n  year         = {2024},\n  url          = {https://arxiv.org/abs/2411.09009},\n}\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#grokfast",
-    "href": "docs/custom_integrations.html#grokfast",
-    "title": "Custom Integrations",
-    "section": "Grokfast",
-    "text": "Grokfast\nSee https://github.com/ironjr/grokfast\n\nUsage\nplugins:\n  - axolotl.integrations.grokfast.GrokfastPlugin\n\ngrokfast_alpha: 2.0\ngrokfast_lamb: 0.98\n\n\nCitation\n@article{lee2024grokfast,\n    title={{Grokfast}: Accelerated Grokking by Amplifying Slow Gradients},\n    author={Lee, Jaerin and Kang, Bong Gyun and Kim, Kihoon and Lee, Kyoung Mu},\n    journal={arXiv preprint arXiv:2405.20233},\n    year={2024}\n}\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#knowledge-distillation-kd",
-    "href": "docs/custom_integrations.html#knowledge-distillation-kd",
-    "title": "Custom Integrations",
-    "section": "Knowledge Distillation (KD)",
-    "text": "Knowledge Distillation (KD)\n\nUsage\nplugins:\n  - \"axolotl.integrations.kd.KDPlugin\"\n\nkd_trainer: True\nkd_ce_alpha: 0.1\nkd_alpha: 0.9\nkd_temperature: 1.0\n\ntorch_compile: True  # torch&gt;=2.5.1, recommended to reduce vram\n\ndatasets:\n  - path: ...\n    type: \"axolotl.integrations.kd.chat_template\"\n    field_messages: \"messages_combined\"\n    logprobs_field: \"llm_text_generation_vllm_logprobs\"  # for kd only, field of logprobs\nAn example dataset can be found at axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#liger-kernels",
-    "href": "docs/custom_integrations.html#liger-kernels",
-    "title": "Custom Integrations",
-    "section": "Liger Kernels",
-    "text": "Liger Kernels\nLiger Kernel provides efficient Triton kernels for LLM training, offering:\n\n20% increase in multi-GPU training throughput\n60% reduction in memory usage\nCompatibility with both FSDP and DeepSpeed\n\nSee https://github.com/linkedin/Liger-Kernel\n\nUsage\nplugins:\n  - axolotl.integrations.liger.LigerPlugin\nliger_rope: true\nliger_rms_norm: true\nliger_glu_activation: true\nliger_layer_norm: true\nliger_fused_linear_cross_entropy: true\n\n\nSupported Models\n\ndeepseek_v2\ngemma\ngemma2\ngemma3\ngranite\njamba\nllama\nmistral\nmixtral\nmllama\nmllama_text_model\nolmo2\npaligemma\nphi3\nqwen2\nqwen2_5_vl\nqwen2_vl\n\n\n\nCitation\n@article{hsu2024ligerkernelefficienttriton,\n      title={Liger Kernel: Efficient Triton Kernels for LLM Training},\n      author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},\n      year={2024},\n      eprint={2410.10989},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https://arxiv.org/abs/2410.10989},\n      journal={arXiv preprint arXiv:2410.10989},\n}\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#language-model-evaluation-harness-lm-eval",
-    "href": "docs/custom_integrations.html#language-model-evaluation-harness-lm-eval",
-    "title": "Custom Integrations",
-    "section": "Language Model Evaluation Harness (LM Eval)",
-    "text": "Language Model Evaluation Harness (LM Eval)\nRun evaluation on model using the popular lm-evaluation-harness library.\nSee https://github.com/EleutherAI/lm-evaluation-harness\n\nUsage\nplugins:\n  - axolotl.integrations.lm_eval.LMEvalPlugin\n\nlm_eval_tasks:\n  - gsm8k\n  - hellaswag\n  - arc_easy\n\nlm_eval_batch_size: # Batch size for evaluation\noutput_dir: # Directory to save evaluation results\n\n\nCitation\n@misc{eval-harness,\n  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},\n  title        = {A framework for few-shot language model evaluation},\n  month        = 07,\n  year         = 2024,\n  publisher    = {Zenodo},\n  version      = {v0.4.3},\n  doi          = {10.5281/zenodo.12608602},\n  url          = {https://zenodo.org/records/12608602}\n}\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#spectrum",
-    "href": "docs/custom_integrations.html#spectrum",
-    "title": "Custom Integrations",
-    "section": "Spectrum",
-    "text": "Spectrum\nby Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar\nThis plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).\nSee https://github.com/cognitivecomputations/spectrum\n\nOverview\nSpectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models.\nBy identifying the top n% of layers with the highest SNR, you can optimize training efficiency.\n\n\nUsage\nplugins:\n  - axolotl.integrations.spectrum.SpectrumPlugin\n\nspectrum_top_fraction: 0.5\nspectrum_model_name: meta-llama/Meta-Llama-3.1-8B\n\n\nCitation\n@misc{hartford2024spectrumtargetedtrainingsignal,\n      title={Spectrum: Targeted Training on Signal to Noise Ratio},\n      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},\n      year={2024},\n      eprint={2406.06623},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https://arxiv.org/abs/2406.06623},\n}\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#llmcompressor",
-    "href": "docs/custom_integrations.html#llmcompressor",
-    "title": "Custom Integrations",
-    "section": "LLMCompressor",
-    "text": "LLMCompressor\nFine-tune sparsified models in Axolotl using Neural Magic’s LLMCompressor.\nThis integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor’s model compression capabilities with Axolotl’s distributed training pipelines, users can efficiently fine-tune sparse models at scale.\nIt uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.\n\n\nRequirements\n\nAxolotl with llmcompressor extras:\npip install \"axolotl[llmcompressor]\"\nRequires llmcompressor &gt;= 0.5.1\n\nThis will install all necessary dependencies to fine-tune sparsified models using the integration.\n\n\n\nUsage\nTo enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:\nplugins:\n  - axolotl.integrations.llm_compressor.LLMCompressorPlugin\n\nllmcompressor:\n  recipe:\n    finetuning_stage:\n      finetuning_modifiers:\n        ConstantPruningModifier:\n          targets: [\n            're:.*q_proj.weight',\n            're:.*k_proj.weight',\n            're:.*v_proj.weight',\n            're:.*o_proj.weight',\n            're:.*gate_proj.weight',\n            're:.*up_proj.weight',\n            're:.*down_proj.weight',\n          ]\n          start: 0\n  save_compressed: true\nThis plugin does not apply pruning or sparsification itself — it is intended for fine-tuning models that have already been sparsified.\nPre-sparsified checkpoints can be:\n- Generated using LLMCompressor\n- Downloaded from Neural Magic’s Hugging Face page\n- Any custom LLM with compatible sparsity patterns that you’ve created yourself\nTo learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:\nhttps://github.com/vllm-project/llm-compressor/blob/main/README.md\n\n\nStorage Optimization with save_compressed\nSetting save_compressed: true in your configuration enables saving models in a 
compressed format, which:\n- Reduces disk space usage by approximately 40%\n- Maintains compatibility with vLLM for accelerated inference\n- Maintains compatibility with llmcompressor for further optimization (example: quantization)\nThis option is highly recommended when working with sparse models to maximize the benefits of model compression.\n\n\nExample Config\nSee examples/llama-3/sparse-finetuning.yaml for a complete example.\n\n\n\nInference with vLLM\nAfter fine-tuning your sparse model, you can leverage vLLM for efficient inference.\nYou can also use LLMCompressor to apply additional quantization to your fine-tuned\nsparse model before inference for even greater performance benefits.:\nfrom vllm import LLM, SamplingParams\n\nprompts = [\n    \"Hello, my name is\",\n    \"The president of the United States is\",\n    \"The capital of France is\",\n    \"The future of AI is\",\n]\nsampling_params = SamplingParams(temperature=0.8, top_p=0.95)\nllm = LLM(\"path/to/your/sparse/model\")\noutputs = llm.generate(prompts, sampling_params)\n\nfor output in outputs:\n    prompt = output.prompt\n    generated_text = output.outputs[0].text\n    print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\nFor more details on vLLM’s capabilities and advanced configuration options, see the official vLLM documentation.\n\n\nLearn More\nFor details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:\nhttps://github.com/vllm-project/llm-compressor\nPlease see reference here",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
-    ]
-  },
-  {
-    "objectID": "docs/custom_integrations.html#adding-a-new-integration",
-    "href": "docs/custom_integrations.html#adding-a-new-integration",
-    "title": "Custom Integrations",
-    "section": "Adding a new integration",
-    "text": "Adding a new integration\nPlugins can be used to customize the behavior of the training pipeline through hooks. See axolotl.integrations.BasePlugin for the possible hooks.\nTo add a new integration, please follow these steps:\n\nCreate a new folder in the src/axolotl/integrations directory.\nAdd any relevant files (LICENSE, README.md, ACKNOWLEDGEMENTS.md, etc.) to the new folder.\nAdd __init__.py and args.py files to the new folder.\n\n\n__init__.py should import the integration and hook into the appropriate functions.\nargs.py should define the arguments for the integration.\n\n\n(If applicable) Add CPU tests under tests/integrations or GPU tests under tests/e2e/integrations.\n\n\n\n\n\n\n\nTip\n\n\n\nSee src/axolotl/integrations/cut_cross_entropy for a minimal integration example.\n\n\n\n\n\n\n\n\nWarning\n\n\n\nIf you could not load your integration, please ensure you are pip installing in editable mode.\npip install -e .\nand correctly spelled the integration name in the config file.\nplugins:\n  - axolotl.integrations.your_integration_name.YourIntegrationPlugin\n\n\n\n\n\n\n\n\nNote\n\n\n\nIt is not necessary to place your integration in the integrations folder. It can be in any location, so long as it’s installed in a package in your python env.\nSee this repo for an example: https://github.com/axolotl-ai-cloud/diff-transformer",
-    "crumbs": [
-      "Advanced Features",
-      "Custom Integrations"
+      "Dataset Formats",
+      "Stepwise Supervised Format"
     ]
   },
   {
@@ -2104,972 +1997,1093 @@
     ]
   },
   {
-    "objectID": "docs/dataset-formats/stepwise_supervised.html",
-    "href": "docs/dataset-formats/stepwise_supervised.html",
-    "title": "Stepwise Supervised Format",
+    "objectID": "docs/custom_integrations.html",
+    "href": "docs/custom_integrations.html",
+    "title": "Custom Integrations",
     "section": "",
-    "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n  \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n  \"completions\": [\n    \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n    \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n  ],\n  \"labels\": [true, false]\n}",
+    "text": "Axolotl adds custom features through integrations. They are located within the src/axolotl/integrations directory.\nTo enable them, please check the respective documentations.",
     "crumbs": [
-      "Dataset Formats",
-      "Stepwise Supervised Format"
+      "Advanced Features",
+      "Custom Integrations"
     ]
   },
   {
-    "objectID": "docs/dataset-formats/stepwise_supervised.html#stepwise-supervised",
-    "href": "docs/dataset-formats/stepwise_supervised.html#stepwise-supervised",
-    "title": "Stepwise Supervised Format",
+    "objectID": "docs/custom_integrations.html#cut-cross-entropy",
+    "href": "docs/custom_integrations.html#cut-cross-entropy",
+    "title": "Custom Integrations",
+    "section": "Cut Cross Entropy",
+    "text": "Cut Cross Entropy\nCut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.\nSee https://github.com/apple/ml-cross-entropy\n\nRequirements\n\nPyTorch 2.4.0 or higher\n\n\n\nInstallation\nRun the following command to install cut_cross_entropy[transformers] if you don’t have it already.\n\nIf you are in dev environment\n\npython scripts/cutcrossentropy_install.py | sh\n\nIf you are installing from pip\n\npip3 uninstall -y cut-cross-entropy && pip3 install \"cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@bad6f7b49c75fdec69471abb71b4cddd0f0c6438\"\n\n\nUsage\nplugins:\n  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n\nSupported Models\n\nllama\nllama4\nllama4_text\nmllama\nphi3\ngemma\ngemma2\ngemma3\ngemma3_text\nmistral\nmistral3\nqwen2\nqwen2_moe\nqwen2_vl\nqwen2_5_vl\nqwen3\nqwen3_moe\ncohere\ncohere2\nglm\nglm4\n\n\n\nCitation\n@article{wijmans2024cut,\n  author       = {Erik Wijmans and\n                  Brody Huval and\n                  Alexander Hertzberg and\n                  Vladlen Koltun and\n                  Philipp Kr\\\"ahenb\\\"uhl},\n  title        = {Cut Your Losses in Large-Vocabulary Language Models},\n  journal      = {arXiv},\n  year         = {2024},\n  url          = {https://arxiv.org/abs/2411.09009},\n}\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#grokfast",
+    "href": "docs/custom_integrations.html#grokfast",
+    "title": "Custom Integrations",
+    "section": "Grokfast",
+    "text": "Grokfast\nSee https://github.com/ironjr/grokfast\n\nUsage\nplugins:\n  - axolotl.integrations.grokfast.GrokfastPlugin\n\ngrokfast_alpha: 2.0\ngrokfast_lamb: 0.98\n\n\nCitation\n@article{lee2024grokfast,\n    title={{Grokfast}: Accelerated Grokking by Amplifying Slow Gradients},\n    author={Lee, Jaerin and Kang, Bong Gyun and Kim, Kihoon and Lee, Kyoung Mu},\n    journal={arXiv preprint arXiv:2405.20233},\n    year={2024}\n}\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#knowledge-distillation-kd",
+    "href": "docs/custom_integrations.html#knowledge-distillation-kd",
+    "title": "Custom Integrations",
+    "section": "Knowledge Distillation (KD)",
+    "text": "Knowledge Distillation (KD)\n\nUsage\nplugins:\n  - \"axolotl.integrations.kd.KDPlugin\"\n\nkd_trainer: True\nkd_ce_alpha: 0.1\nkd_alpha: 0.9\nkd_temperature: 1.0\n\ntorch_compile: True  # torch&gt;=2.5.1, recommended to reduce vram\n\ndatasets:\n  - path: ...\n    type: \"axolotl.integrations.kd.chat_template\"\n    field_messages: \"messages_combined\"\n    logprobs_field: \"llm_text_generation_vllm_logprobs\"  # for kd only, field of logprobs\nAn example dataset can be found at axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#liger-kernels",
+    "href": "docs/custom_integrations.html#liger-kernels",
+    "title": "Custom Integrations",
+    "section": "Liger Kernels",
+    "text": "Liger Kernels\nLiger Kernel provides efficient Triton kernels for LLM training, offering:\n\n20% increase in multi-GPU training throughput\n60% reduction in memory usage\nCompatibility with both FSDP and DeepSpeed\n\nSee https://github.com/linkedin/Liger-Kernel\n\nUsage\nplugins:\n  - axolotl.integrations.liger.LigerPlugin\nliger_rope: true\nliger_rms_norm: true\nliger_glu_activation: true\nliger_layer_norm: true\nliger_fused_linear_cross_entropy: true\n\n\nSupported Models\n\ndeepseek_v2\ngemma\ngemma2\ngemma3\ngranite\njamba\nllama\nmistral\nmixtral\nmllama\nmllama_text_model\nolmo2\npaligemma\nphi3\nqwen2\nqwen2_5_vl\nqwen2_vl\n\n\n\nCitation\n@article{hsu2024ligerkernelefficienttriton,\n      title={Liger Kernel: Efficient Triton Kernels for LLM Training},\n      author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},\n      year={2024},\n      eprint={2410.10989},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https://arxiv.org/abs/2410.10989},\n      journal={arXiv preprint arXiv:2410.10989},\n}\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#language-model-evaluation-harness-lm-eval",
+    "href": "docs/custom_integrations.html#language-model-evaluation-harness-lm-eval",
+    "title": "Custom Integrations",
+    "section": "Language Model Evaluation Harness (LM Eval)",
+    "text": "Language Model Evaluation Harness (LM Eval)\nRun evaluation on model using the popular lm-evaluation-harness library.\nSee https://github.com/EleutherAI/lm-evaluation-harness\n\nUsage\nplugins:\n  - axolotl.integrations.lm_eval.LMEvalPlugin\n\nlm_eval_tasks:\n  - gsm8k\n  - hellaswag\n  - arc_easy\n\nlm_eval_batch_size: # Batch size for evaluation\noutput_dir: # Directory to save evaluation results\n\n\nCitation\n@misc{eval-harness,\n  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},\n  title        = {A framework for few-shot language model evaluation},\n  month        = 07,\n  year         = 2024,\n  publisher    = {Zenodo},\n  version      = {v0.4.3},\n  doi          = {10.5281/zenodo.12608602},\n  url          = {https://zenodo.org/records/12608602}\n}\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#spectrum",
+    "href": "docs/custom_integrations.html#spectrum",
+    "title": "Custom Integrations",
+    "section": "Spectrum",
+    "text": "Spectrum\nby Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar\nThis plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).\nSee https://github.com/cognitivecomputations/spectrum\n\nOverview\nSpectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models.\nBy identifying the top n% of layers with the highest SNR, you can optimize training efficiency.\n\n\nUsage\nplugins:\n  - axolotl.integrations.spectrum.SpectrumPlugin\n\nspectrum_top_fraction: 0.5\nspectrum_model_name: meta-llama/Meta-Llama-3.1-8B\n\n\nCitation\n@misc{hartford2024spectrumtargetedtrainingsignal,\n      title={Spectrum: Targeted Training on Signal to Noise Ratio},\n      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},\n      year={2024},\n      eprint={2406.06623},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https://arxiv.org/abs/2406.06623},\n}\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#llmcompressor",
+    "href": "docs/custom_integrations.html#llmcompressor",
+    "title": "Custom Integrations",
+    "section": "LLMCompressor",
+    "text": "LLMCompressor\nFine-tune sparsified models in Axolotl using Neural Magic’s LLMCompressor.\nThis integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor’s model compression capabilities with Axolotl’s distributed training pipelines, users can efficiently fine-tune sparse models at scale.\nIt uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.\n\n\nRequirements\n\nAxolotl with llmcompressor extras:\npip install \"axolotl[llmcompressor]\"\nRequires llmcompressor &gt;= 0.5.1\n\nThis will install all necessary dependencies to fine-tune sparsified models using the integration.\n\n\n\nUsage\nTo enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:\nplugins:\n  - axolotl.integrations.llm_compressor.LLMCompressorPlugin\n\nllmcompressor:\n  recipe:\n    finetuning_stage:\n      finetuning_modifiers:\n        ConstantPruningModifier:\n          targets: [\n            're:.*q_proj.weight',\n            're:.*k_proj.weight',\n            're:.*v_proj.weight',\n            're:.*o_proj.weight',\n            're:.*gate_proj.weight',\n            're:.*up_proj.weight',\n            're:.*down_proj.weight',\n          ]\n          start: 0\n  save_compressed: true\nThis plugin does not apply pruning or sparsification itself — it is intended for fine-tuning models that have already been sparsified.\nPre-sparsified checkpoints can be:\n- Generated using LLMCompressor\n- Downloaded from Neural Magic’s Hugging Face page\n- Any custom LLM with compatible sparsity patterns that you’ve created yourself\nTo learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:\nhttps://github.com/vllm-project/llm-compressor/blob/main/README.md\n\n\nStorage Optimization with save_compressed\nSetting save_compressed: true in your configuration enables saving models in a 
compressed format, which:\n- Reduces disk space usage by approximately 40%\n- Maintains compatibility with vLLM for accelerated inference\n- Maintains compatibility with llmcompressor for further optimization (example: quantization)\nThis option is highly recommended when working with sparse models to maximize the benefits of model compression.\n\n\nExample Config\nSee examples/llama-3/sparse-finetuning.yaml for a complete example.\n\n\n\nInference with vLLM\nAfter fine-tuning your sparse model, you can leverage vLLM for efficient inference.\nYou can also use LLMCompressor to apply additional quantization to your fine-tuned\nsparse model before inference for even greater performance benefits.:\nfrom vllm import LLM, SamplingParams\n\nprompts = [\n    \"Hello, my name is\",\n    \"The president of the United States is\",\n    \"The capital of France is\",\n    \"The future of AI is\",\n]\nsampling_params = SamplingParams(temperature=0.8, top_p=0.95)\nllm = LLM(\"path/to/your/sparse/model\")\noutputs = llm.generate(prompts, sampling_params)\n\nfor output in outputs:\n    prompt = output.prompt\n    generated_text = output.outputs[0].text\n    print(f\"Prompt: {prompt!r}, Generated text: {generated_text!r}\")\nFor more details on vLLM’s capabilities and advanced configuration options, see the official vLLM documentation.\n\n\nLearn More\nFor details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:\nhttps://github.com/vllm-project/llm-compressor\nPlease see reference here",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/custom_integrations.html#adding-a-new-integration",
+    "href": "docs/custom_integrations.html#adding-a-new-integration",
+    "title": "Custom Integrations",
+    "section": "Adding a new integration",
+    "text": "Adding a new integration\nPlugins can be used to customize the behavior of the training pipeline through hooks. See axolotl.integrations.BasePlugin for the possible hooks.\nTo add a new integration, please follow these steps:\n\nCreate a new folder in the src/axolotl/integrations directory.\nAdd any relevant files (LICENSE, README.md, ACKNOWLEDGEMENTS.md, etc.) to the new folder.\nAdd __init__.py and args.py files to the new folder.\n\n\n__init__.py should import the integration and hook into the appropriate functions.\nargs.py should define the arguments for the integration.\n\n\n(If applicable) Add CPU tests under tests/integrations or GPU tests under tests/e2e/integrations.\n\n\n\n\n\n\n\nTip\n\n\n\nSee src/axolotl/integrations/cut_cross_entropy for a minimal integration example.\n\n\n\n\n\n\n\n\nWarning\n\n\n\nIf you could not load your integration, please ensure you are pip installing in editable mode.\npip install -e .\nand correctly spelled the integration name in the config file.\nplugins:\n  - axolotl.integrations.your_integration_name.YourIntegrationPlugin\n\n\n\n\n\n\n\n\nNote\n\n\n\nIt is not necessary to place your integration in the integrations folder. It can be in any location, so long as it’s installed in a package in your python env.\nSee this repo for an example: https://github.com/axolotl-ai-cloud/diff-transformer",
+    "crumbs": [
+      "Advanced Features",
+      "Custom Integrations"
+    ]
+  },
+  {
+    "objectID": "docs/mac.html",
+    "href": "docs/mac.html",
+    "title": "Mac M-series",
     "section": "",
-    "text": "The stepwise supervised format is designed for chain-of-thought (COT) reasoning\ndatasets where each example contains multiple completion steps and a preference label\nfor each step.\n\n\nHere’s a simple example of a stepwise supervised dataset entry:\n{\n  \"prompt\": \"Which number is larger, 9.8 or 9.11?\",\n  \"completions\": [\n    \"The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.\",\n    \"Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.\"\n  ],\n  \"labels\": [true, false]\n}",
+    "text": "Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.\nCurrent support:\n\nSupport for all models\nFull training of models\nLoRA training\nSample packing\nFP16 and BF16 (awaiting AMP support for MPS in Pytorch)\nTri-dao’s flash-attn (until it is supported use spd_attention as an alternative)\nxformers\nbitsandbytes (meaning no 4/8 bits loading and bnb optimizers)\nqlora\nDeepSpeed\n\nUntested:\n\nFSDP",
     "crumbs": [
-      "Dataset Formats",
-      "Stepwise Supervised Format"
+      "Deployments",
+      "Mac M-series"
     ]
   },
   {
-    "objectID": "docs/dataset-formats/conversation.html",
-    "href": "docs/dataset-formats/conversation.html",
-    "title": "Conversation",
+    "objectID": "docs/rlhf.html",
+    "href": "docs/rlhf.html",
+    "title": "RLHF (Beta)",
     "section": "",
-    "text": "Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer’s template, a supported template, or custom jinja2.\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"content\": \"...\"}]}\n\nSee configs for full configs and supported templates.\n\n\nMost configs can be adapted as follows:\n# old\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: sharegpt\n    conversation: chatml\n\n# new (if using tokenizer's chat_template)\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n\n# new (if setting a new chat_template like chatml, gemma, etc)\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\nWe recommend checking the below examples for other usecases.\n\n\n\n\n(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train:\n    train_on_eos:\n\n\n\n\n\n\nTip\n\n\n\nIf you receive an error like “chat_template choice is tokenizer_default but tokenizer’s chat_template is null.”, it means the tokenizer does not have a default chat_template. 
Follow the examples below instead to set a custom chat_template.\n\n\n\nUsing the gemma chat template to override the tokenizer_config.json’s chat template on OpenAI messages format, training on all assistant messages.\n\nchat_template: gemma # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train: [\"assistant\"]  # default value\n\nUsing the tokenizer_config.json’s chat template or chatml as fallback if the former’s chat template does not exist, on OpenAI messages format, training on all assistant messages.\n\nchat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n\nUsing a custom jinja template on OpenAI messages format, training on all assistant messages.\n\n# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty\nchat_template_jinja: \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'&lt;|system|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n'}}{% elif (message['role'] == 'user') %}{{'&lt;|user|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n' + '&lt;|assistant|&gt;' + '\\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '&lt;|end|&gt;' + '\\n'}}{% endif %}{% endfor %}\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure that your tokenizer.eos_token is same as EOS (End-of-Sequence) token in template. Otherwise, set eos_token under special_tokens:.\n\n\n\nIf you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the eot_tokens: config. 
The handling of EOT tokens follows train_on_eos: which defaults to turn.\n\neot_tokens:\n  - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    # optional\n    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)\n\n\n\n\n\n\nTip\n\n\n\nSee config documentation for detailed explanations of “turn”, “last”, and “all” options for training on tokens.\n\n\n\n\n\n\n\n\nNote\n\n\n\nUsing eot_tokens requires each token that exists in chat_template to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.\nYou can add those tokens as new tokens under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:. See config for more details.\n\n\n\nContinuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set train_on_eos: last.\n\neot_tokens:\n  - \"[/INST]\"\n  # ...\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    train_on_eos: last\n    train_on_eot: turn\n\n\n\n\n\n\nTip\n\n\n\nIf EOS token only appears at the end of a prompt, train_on_eos: last is equivalent to train_on_eos: turn. 
Therefore, generally, you can leave them to their defaults and omit them.\n\n\n\n(Advanced) Using fine-grained control over tokens and turns to train in a conversation\n\nFor a data sample that looks like:\n\n\ndata.jsonl\n\n{\n  \"conversations\": [\n    {\"from\": \"system\", \"value\": \"You are an AI assistant.\", \"train\": false},\n    {\"from\": \"human\", \"value\": \"Hello\", \"train\": false},\n    {\"from\": \"assistant\", \"value\": \"Hello\", \"train\": true},\n    {\"from\": \"human\", \"value\": \"How are you?\", \"train\": true},\n    {\n      \"from\": \"assistant\",\n      \"value\": \"I'm doing very well, thank you!\",\n      \"train_detail\": [\n        {\"begin_offset\": 0, \"end_offset\": 8, \"train\": false},\n        {\"begin_offset\": 9, \"end_offset\": 18, \"train\": true},\n        {\"begin_offset\": 19, \"end_offset\": 30, \"train\": false},\n      ],\n    },\n    {\n        \"from\": \"human\",\n        \"value\": \"I'm doing very well, thank you!\",\n        \"train\": true,\n    },\n    {\"from\": \"assistant\", \"value\": \"Hi there!\", \"train\": true}\n  ]\n}\n\nThe configuration would look like:\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: tokenizer_default\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n    roles_to_train: []\n    train_on_eos: turn\n    message_field_training: train\n    message_field_training_detail: train_detail\n\n\n\n\n\n\nTip\n\n\n\nIt is not necessary to set both message_field_training and message_field_training_detail at once.\n\n\n\n(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: qwen3\n    split_thinking: true\nFor example, a content can look like:\n{\n  \"content\": \"&lt;think&gt;Some thinking outputs&lt;/think&gt;Output after thinking.\"\n}\nAfter 
split, it will look like:\n{\n  \"reasoning_content\": \"Some thinking outputs\",\n  \"content\": \"Output after thinking...\"\n}",
+    "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)",
     "crumbs": [
-      "Dataset Formats",
-      "Conversation"
+      "How To Guides",
+      "RLHF (Beta)"
     ]
   },
   {
-    "objectID": "docs/dataset-formats/conversation.html#chat_template",
-    "href": "docs/dataset-formats/conversation.html#chat_template",
-    "title": "Conversation",
+    "objectID": "docs/rlhf.html#overview",
+    "href": "docs/rlhf.html#overview",
+    "title": "RLHF (Beta)",
     "section": "",
-    "text": "Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer’s template, a supported template, or custom jinja2.\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"content\": \"...\"}]}\n\nSee configs for full configs and supported templates.\n\n\nMost configs can be adapted as follows:\n# old\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: sharegpt\n    conversation: chatml\n\n# new (if using tokenizer's chat_template)\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n\n# new (if setting a new chat_template like chatml, gemma, etc)\nchat_template: chatml\ndatasets:\n  - path: ...\n    type: chat_template\n\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\nWe recommend checking the below examples for other usecases.\n\n\n\n\n(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train:\n    train_on_eos:\n\n\n\n\n\n\nTip\n\n\n\nIf you receive an error like “chat_template choice is tokenizer_default but tokenizer’s chat_template is null.”, it means the tokenizer does not have a default chat_template. 
Follow the examples below instead to set a custom chat_template.\n\n\n\nUsing the gemma chat template to override the tokenizer_config.json’s chat template on OpenAI messages format, training on all assistant messages.\n\nchat_template: gemma # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n    roles_to_train: [\"assistant\"]  # default value\n\nUsing the tokenizer_config.json’s chat template or chatml as fallback if the former’s chat template does not exist, on OpenAI messages format, training on all assistant messages.\n\nchat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template\ndatasets:\n  - path: ...\n    type: chat_template\n\nUsing a custom jinja template on OpenAI messages format, training on all assistant messages.\n\n# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty\nchat_template_jinja: \"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'&lt;|system|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n'}}{% elif (message['role'] == 'user') %}{{'&lt;|user|&gt;' + '\\n' + message['content'] + '&lt;|end|&gt;' + '\\n' + '&lt;|assistant|&gt;' + '\\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '&lt;|end|&gt;' + '\\n'}}{% endif %}{% endfor %}\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure that your tokenizer.eos_token is same as EOS (End-of-Sequence) token in template. Otherwise, set eos_token under special_tokens:.\n\n\n\nIf you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the eot_tokens: config. 
The handling of EOT tokens follows train_on_eos: which defaults to turn.\n\neot_tokens:\n  - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    # optional\n    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)\n\n\n\n\n\n\nTip\n\n\n\nSee config documentation for detailed explanations of “turn”, “last”, and “all” options for training on tokens.\n\n\n\n\n\n\n\n\nNote\n\n\n\nUsing eot_tokens requires each token that exists in chat_template to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.\nYou can add those tokens as new tokens under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:. See config for more details.\n\n\n\nContinuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set train_on_eos: last.\n\neot_tokens:\n  - \"[/INST]\"\n  # ...\n\ndatasets:\n  - path: ...\n    type: chat_template\n\n    train_on_eos: last\n    train_on_eot: turn\n\n\n\n\n\n\nTip\n\n\n\nIf EOS token only appears at the end of a prompt, train_on_eos: last is equivalent to train_on_eos: turn. 
Therefore, generally, you can leave them to their defaults and omit them.\n\n\n\n(Advanced) Using fine-grained control over tokens and turns to train in a conversation\n\nFor a data sample that looks like:\n\n\ndata.jsonl\n\n{\n  \"conversations\": [\n    {\"from\": \"system\", \"value\": \"You are an AI assistant.\", \"train\": false},\n    {\"from\": \"human\", \"value\": \"Hello\", \"train\": false},\n    {\"from\": \"assistant\", \"value\": \"Hello\", \"train\": true},\n    {\"from\": \"human\", \"value\": \"How are you?\", \"train\": true},\n    {\n      \"from\": \"assistant\",\n      \"value\": \"I'm doing very well, thank you!\",\n      \"train_detail\": [\n        {\"begin_offset\": 0, \"end_offset\": 8, \"train\": false},\n        {\"begin_offset\": 9, \"end_offset\": 18, \"train\": true},\n        {\"begin_offset\": 19, \"end_offset\": 30, \"train\": false},\n      ],\n    },\n    {\n        \"from\": \"human\",\n        \"value\": \"I'm doing very well, thank you!\",\n        \"train\": true,\n    },\n    {\"from\": \"assistant\", \"value\": \"Hi there!\", \"train\": true}\n  ]\n}\n\nThe configuration would look like:\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: tokenizer_default\n    field_messages: conversations\n    message_property_mappings:\n      role: from\n      content: value\n    roles_to_train: []\n    train_on_eos: turn\n    message_field_training: train\n    message_field_training_detail: train_detail\n\n\n\n\n\n\nTip\n\n\n\nIt is not necessary to set both message_field_training and message_field_training_detail at once.\n\n\n\n(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.\n\ndatasets:\n  - path: ...\n    type: chat_template\n    chat_template: qwen3\n    split_thinking: true\nFor example, a content can look like:\n{\n  \"content\": \"&lt;think&gt;Some thinking outputs&lt;/think&gt;Output after thinking.\"\n}\nAfter 
split, it will look like:\n{\n  \"reasoning_content\": \"Some thinking outputs\",\n  \"content\": \"Output after thinking...\"\n}",
+    "text": "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human\nfeedback. Various methods include, but not limited to:\n\nDirect Preference Optimization (DPO)\nIdentity Preference Optimization (IPO)\nKahneman-Tversky Optimization (KTO)\nOdds Ratio Preference Optimization (ORPO)\nProximal Policy Optimization (PPO) (not yet supported in axolotl)",
     "crumbs": [
-      "Dataset Formats",
-      "Conversation"
+      "How To Guides",
+      "RLHF (Beta)"
     ]
   },
   {
-    "objectID": "docs/dataset-formats/conversation.html#sharegpt",
-    "href": "docs/dataset-formats/conversation.html#sharegpt",
-    "title": "Conversation",
-    "section": "sharegpt",
-    "text": "sharegpt\n\n\n\n\n\n\nImportant\n\n\n\nShareGPT is deprecated!. Please see chat_template section.",
+    "objectID": "docs/rlhf.html#rlhf-using-axolotl",
+    "href": "docs/rlhf.html#rlhf-using-axolotl",
+    "title": "RLHF (Beta)",
+    "section": "RLHF using Axolotl",
+    "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n  - path: Intel/orca_dpo_pairs\n    split: train\n    type: chatml.intel\n  - path: argilla/ultrafeedback-binarized-preferences\n    split: train\n    type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"chosen_response\": \"...\",\n    \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nchatml.icr\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": 
\"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"chosen_response\": \"...\",\n    \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.icr\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nzephyr.nectar\n{\n    \"prompt\": \"...\",\n    \"answers\": [\n        {\n            \"answer\": \"...\",\n            \"rank\": 1\n        },\n        {\n            \"answer\": \"...\",\n            \"rank\": 2\n        }\n        // ... 
more answers with ranks\n    ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n  - path: ...\n    split: train\n    type: chat_template.default\n    field_messages: \"messages\"\n    field_chosen: \"chosen\"\n    field_rejected: \"rejected\"\n    message_property_mappings:\n      role: role\n      content: content\n    roles:\n      user: [\"user\"]\n      assistant: [\"assistant\"]\n      system: [\"system\"]\nSample input format:\n{\n    \"messages\": [\n        {\n            \"role\": \"system\",\n            \"content\": \"...\"\n        },\n        {\n            \"role\": \"user\",\n            \"content\": \"...\"\n        },\n        // ... more messages\n    ],\n    \"chosen\": {\n        \"role\": \"assistant\",\n        \"content\": \"...\"\n    },\n    \"rejected\": {\n        \"role\": \"assistant\",\n        \"content\": \"...\"\n    }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n  - path: ...\n    split: train\n    type: user_defined.default\n\n    field_prompt: \"prompt\"\n    field_system: \"system\"\n    field_chosen: \"chosen\"\n    field_rejected: \"rejected\"\n    prompt_format: \"{prompt}\"\n    chosen_format: \"{chosen}\"\n    rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n  - path: argilla/ultrafeedback-binarized-preferences-cleaned\n    type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",  // if 
available, will be taken as user message for single-turn instead of from list below\n\n    // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ],\n    \"rejected\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1  # default\nkto_desirable_weight: 1.0  # default\nkto_undesirable_weight: 1.0  # default\n\nremove_unused_columns: false\n\ndatasets:\n  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n    type: llama3.ultra\n    split: train\n\ngradient_checkpointing: true\ngradient_checkpointing_kwargs:\n  use_reentrant: true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n    \"chosen\": [\n        {\"role\": \"user\", \"content\": \"...\"}\n    ],\n    \"completion\": [\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nchatml.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n    \"system\": \"...\", // optional\n    \"instruction\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n    \"completion\": [\n        {\"role\": \"user\", \"content\": \"...\"},\n        {\"role\": \"assistant\", \"content\": \"...\"}\n    ]\n}\n\n\nllama3.intel\n{\n    \"system\": \"...\", // optional\n    \"question\": \"...\",\n    \"completion\": 
\"...\"\n}\n\n\nllama3.prompt_pairs\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nllama3.ultra\n{\n    \"system\": \"...\", // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n  - path: ...\n    split: train\n    type: user_defined.default\n\n    field_prompt: \"prompt\"\n    field_system: \"system\"\n    field_completion: \"completion\"\n    field_label: \"label\"\n    prompt_format: \"{prompt}\"\n    completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n    \"system\": \"...\",  // optional\n    \"prompt\": \"...\",\n    \"completion\": \"...\",\n    \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n    host: 0.0.0.0\n    port: 8000\n    tensor_parallel_size: 2\n    gpu_memory_utilization: 0.85\n    dtype: auto\n    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n    use_vllm: true\n    vllm_server_host: 0.0.0.0\n    vllm_server_port: 8000\n    vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. 
In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -&gt; list[float]:\n    return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n    def transform_fn(example, tokenizer=None):\n        label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n        return {\n            \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n            \"answer\": label,\n        }\n    return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n    beta: 0.001\n    max_completion_length: 256\n    use_vllm: True\n    num_generations: 4\n    reward_funcs: [\"rewards.rand_reward_func\"]    # format: '{file_name}.{fn_name}'\n    reward_weights: [1.0]\ndatasets:\n  - path: openai/gsm8k\n    name: main\n    type: rewards.oai_gsm8k_transform  # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see description of the configs, please see TRLConfig.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1  # default in CPOTrainer\ncpo_alpha: 1.0  # default in CPOTrainer\nsimpo_gamma: 0.5  # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n  - ds_type: json\n    data_files:\n      - orca_rlhf.jsonl\n    split: train\n    type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports 
auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true",
     "crumbs": [
-      "Dataset Formats",
-      "Conversation"
+      "How To Guides",
+      "RLHF (Beta)"
     ]
   },
   {
-    "objectID": "docs/dataset-formats/conversation.html#pygmalion",
-    "href": "docs/dataset-formats/conversation.html#pygmalion",
-    "title": "Conversation",
-    "section": "pygmalion",
-    "text": "pygmalion\n\n\ndata.jsonl\n\n{\"conversations\": [{\"role\": \"...\", \"value\": \"...\"}]}",
-    "crumbs": [
-      "Dataset Formats",
-      "Conversation"
-    ]
-  },
-  {
-    "objectID": "docs/inference.html",
-    "href": "docs/inference.html",
-    "title": "Inference and Merging",
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html",
+    "title": "Setting up",
     "section": "",
-    "text": "This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.",
+    "text": "import torch\n# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\nassert (torch.cuda.is_available()==True)\n!pip install --no-build-isolation axolotl[deepspeed]"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#hugging-face-login-optional",
+    "title": "Setting up",
+    "section": "Hugging Face login (optional)",
+    "text": "Hugging Face login (optional)\n\nfrom huggingface_hub import notebook_login\nnotebook_login()"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#example-configuration",
+    "title": "Setting up",
+    "section": "Example configuration",
+    "text": "Example configuration\n\nimport yaml\n\nyaml_string = \"\"\"\nbase_model: NousResearch/Meta-Llama-3.1-8B\n\nload_in_8bit: false\nload_in_4bit: true\nstrict: false\n\ndatasets:\n  - path: tatsu-lab/alpaca\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.05\noutput_dir: ./outputs/lora-out\n\nsequence_len: 2048\nsample_packing: true\neval_sample_packing: true\npad_to_sequence_len: true\n\nadapter: qlora\nlora_model_dir:\nlora_r: 32\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_linear: true\nlora_fan_in_fan_out:\nlora_modules_to_save:\n  - embed_tokens\n  - lm_head\n\nwandb_project:\nwandb_entity:\nwandb_watch:\nwandb_name:\nwandb_log_model:\n\ngradient_accumulation_steps: 2\nmicro_batch_size: 1\nnum_epochs: 1\noptimizer: paged_adamw_8bit\nlr_scheduler: cosine\nlearning_rate: 2e-5\n\ntrain_on_inputs: false\ngroup_by_length: false\nbf16: auto\nfp16:\ntf32: false\n\ngradient_checkpointing: true\nearly_stopping_patience:\nresume_from_checkpoint:\nlogging_steps: 1\nxformers_attention:\nflash_attention: false\nsdp_attention: true\n\nwarmup_steps: 1\nmax_steps: 25\nevals_per_epoch: 1\neval_table_size:\nsaves_per_epoch: 1\ndebug:\ndeepspeed:\nweight_decay: 0.0\nfsdp:\nfsdp_config:\nspecial_tokens:\n  pad_token: &lt;|end_of_text|&gt;\n\"\"\"\n\n\n# Convert the YAML string to a Python dictionary\nyaml_dict = yaml.safe_load(yaml_string)\n\n# Specify your file path\nfile_path = 'test_axolotl.yaml'\n\n# Write the YAML file\nwith open(file_path, 'w') as file:\n    yaml.dump(yaml_dict, file)\n\nAbove we have a configuration file with base LLM model and datasets specified, among many other things. Axolotl can automatically detect whether the specified datasets are on HuggingFace repo or local machine.\nThe Axolotl configuration options encompass model and dataset selection, data pre-processing, and training. 
Let’s go through them line by line:\n\n“base model”: String value, specifies the underlying pre-trained LLM that will be used for finetuning\n\nNext we have options for model weights quantization. Quantization allows for reduction in occupied memory on GPUs.\n\n“load_in_8bit”: Boolean value, whether to quantize the model weights into 8-bit integer.\n“load_in_4bit”: Boolean value, whether to quantize the model weights into 4-bit integer.\n“strict”: Boolean value. If false, it allows for overriding established configuration options in the yaml file when executing in command-line interface.\n“datasets”: a list of dicts that contain path and type of data sets as well as other optional configurations where datasets are concerned. Supports multiple datasets.\n“val_set_size”: Either a float value less than one or an integer less than the total size of dataset. Sets the size of validation set from the whole dataset. If float, sets the proportion of the dataset assigned for validation. If integer, sets the direct size of validation set.\n“output_dir”: String value. Path of trained model.\n\nFor data preprocessing:\n\n“sequence_len”: Integer. Specifies the maximum sequence length of the input. Typically 2048 or less.\n“pad_to_sequence_len”: Boolean. Padding input to maximum sequence length.\n“sample_packing”: Boolean. Specifies whether to use multi-packing with block diagonal attention.\n“special_tokens”: Python dict, optional. Allows users to specify the additional special tokens to be ignored by the tokenizer.\n\nFor LoRA configuration and its hyperparamters:\n\n“adapter”: String. Either “lora” or “qlora”, depending on user’s choice.\n“lora_model_dir”: String, Optional. Path to directory that contains LoRA model, if there is already a trained LoRA model the user would like to use.\n“lora_r”: Integer. Refers to the rank of LoRA decomposition matrices. Higher value will reduce LoRA efficiency. Recommended to be set to 8.\n“lora_alpha”: Integer. 
Scale the weight matrices by \\(\\frac{\\text{lora_alpha}}{\\text{lora_r}}\\)Recommended to be fixed at 16.\n“lora_dropout”: Float that is 1 or less. The dropout probability of a lora layer.\n“lora_target_linear”: Boolean. If true, lora will target all linear modules in the transformers architecture.\n“lora_modules_to_save”: If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n\nSee LoRA for detailed explanation of LoRA implementation.\nFor the training configurations:\n\n“gradient_accumulation_steps”: Integer. The number of steps over which to accumulate gradient for batch training. E.g. if 2, backprop is performed every two steps.\n“micro_batch_size”: Integer. Batch size per gpu / gradient_accumulation_steps\n“num_epochs”: Integer. Number of epochs. One epoch is when training has looped over every batch in the whole data set once.\n“optimizer”: The optimizer to use for the training.\n“learning_rate”: The learning rate.\n“lr_scheduler”: The learning rate scheduler to use for adjusting learning rate during training.\n“train_on_inputs”: Boolean. Whether to ignore or include the user’s prompt from the training labels.\n“group_by_length”: Boolean. Whether to group similarly sized data to minimize padding.\n“bf16”: Either “auto”, “true”, or “false”. Whether to use CUDA bf16 floating point format. If set to “auto”, will automatically apply bf16 should the gpu supports it.\n“fp16”: Optional. Specifies whether to use CUDA fp16. Automatically set to true if “bf16” is set to true. Otherwise false.\n“tf32”: Boolean. Whether to use CUDA tf32. Will override bf16.\n“gradient_checkpointing”: Boolean. Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\n“gradient_checkpointing_kwargs”: Python Dict. Fed into the trainer.\n“logging_steps”: Integer. Log training information over every specified number of steps.\n“flash_attention”: Boolean. 
Whether to use the flash attention mechanism.\n“sdp_attention”: Boolean. Whether to use the Scaled Dot Product attention mechanism (the attention mechanism in the original implementation of transformers.)\n“warmup_steps”: Integer. The number of pre-training steps where a very low learning rate is used.\n“evals_per_epoch”: Integer. Number of evaluations to be performed within one training epoch.\n“saves_per_epoch”: Integer. Number of times the model is saved in one training epoch.\n“weight_decay”: Positive Float. Sets the “strength” of weight decay (i.e. setting the coefficient of L2 regularization)\n\nThe above is but a snippet aiming to get users familiarized with the types of streamlined configuration options axolotl provides. For a full list of configuration options, see here\nTrain the model\n\n!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml\n\nPredict with trained model\n\n!accelerate launch -m axolotl.cli.inference /content/test_axolotl.yaml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#deeper-dive",
+    "title": "Setting up",
+    "section": "Deeper Dive",
+    "text": "Deeper Dive\nIt is also helpful to gain some familiarity over some of the core inner workings of axolotl"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#configuration-normalization",
+    "title": "Setting up",
+    "section": "Configuration Normalization",
+    "text": "Configuration Normalization\nAxolotl uses a custom Dict class, called DictDefault\nto store configurations specified in the yaml configuration file (into a Python variable named cfg). The definition for this custom Dict can be found in the utils/dict.py\nDictDefault is amended such that calling a missing key from it will result in a None return type. This is important because if some configuration options aren’t specified by the user, the None type allows Axolotl to perform boolean operations to determine the default settings for missing configurations. For more examples on how this is done, check out utils/config/init.py"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#loading-models-tokenizers-and-trainer",
+    "title": "Setting up",
+    "section": "Loading Models, Tokenizers, and Trainer",
+    "text": "Loading Models, Tokenizers, and Trainer\nIf we inspect cli.train.py, we will find that most of the heavy lifting were done by the function train() which is itself imported from src/axolotl/train.py.\ntrain() takes care of loading the appropriate tokenizer and pre-trained model through load_model() and load_tokenizer() from src/axolotl/utils/models.py respectively.\nload_tokenizer() loads in the appropriate tokenizer given the desired model, as well as chat templates.\nModelLoader class follows after tokenizer has been selected. It will automatically discern the base model type, load in the desired model, as well as applying model-appropriate attention mechanism modifications (e.g. flash attention). Depending on which base model the user chooses in the configuration, ModelLoader will utilize the corresponding “attention hijacking” script. For example, if the user specified the base model to be NousResearch/Meta-Llama-3.1-8B, which is of llama type, and set flash_attn to True, ModelLoader will load in llama_attn_hijack_flash.py. For a list of supported attention hijacking, please refer to the directory /src/axolotl/monkeypatch/\nAnother important operation encompassed in train() is setting up the training that takes into account of user-specified traning configurations (e.g. num_epochs, optimizer) through the use of setup_trainer() from /src/axolotl/utils/trainer.py, which in turn relies on modules from /src/axolotl/core/trainer_builder.py.\ntrainer_builder.py provides a list of trainer object options bespoke for the task type (Causal or Reinforcement learning (‘dpo’, ‘ipo’, ‘kto’) )"
+  },
+  {
+    "objectID": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
+    "href": "examples/colab-notebooks/colab-axolotl-example.html#monkey-patch",
+    "title": "Setting up",
+    "section": "Monkey patch",
+    "text": "Monkey patch\nThe Monkey patch directory is where model architecture/optimization patching scripts are stored (these are modifications that are not implemented in the official releases, hence the name monkey patch). It includes attention jacking, ReLoRA, and unsloth optimization."
+  },
+  {
+    "objectID": "docs/getting-started.html",
+    "href": "docs/getting-started.html",
+    "title": "Quickstart",
+    "section": "",
+    "text": "This guide will walk you through your first model fine-tuning project with Axolotl.",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/inference.html#sec-quickstart",
-    "href": "docs/inference.html#sec-quickstart",
-    "title": "Inference and Merging",
-    "section": "1 Quick Start",
-    "text": "1 Quick Start\n\n\n\n\n\n\nTip\n\n\n\nUse the same config used for training on inference/merging.\n\n\n\n1.1 Basic Inference\n\nLoRA ModelsFull Fine-tuned Models\n\n\naxolotl inference your_config.yml --lora-model-dir=\"./lora-output-dir\"\n\n\naxolotl inference your_config.yml --base-model=\"./completed-model\"",
+    "objectID": "docs/getting-started.html#sec-quick-example",
+    "href": "docs/getting-started.html#sec-quick-example",
+    "title": "Quickstart",
+    "section": "1 Quick Example",
+    "text": "1 Quick Example\nLet’s start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.\nAssuming axolotl is installed (if not, see our Installation Guide)\n\nDownload example configs:\n\naxolotl fetch examples\n\nRun the training:\n\naxolotl train examples/llama-3/lora-1b.yml\nThat’s it! Let’s understand what just happened.",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/inference.html#sec-advanced",
-    "href": "docs/inference.html#sec-advanced",
-    "title": "Inference and Merging",
-    "section": "2 Advanced Usage",
-    "text": "2 Advanced Usage\n\n2.1 Gradio Interface\nLaunch an interactive web interface:\naxolotl inference your_config.yml --gradio\n\n\n2.2 File-based Prompts\nProcess prompts from a text file:\ncat /tmp/prompt.txt | axolotl inference your_config.yml \\\n  --base-model=\"./completed-model\" --prompter=None\n\n\n2.3 Memory Optimization\nFor large models or limited memory:\naxolotl inference your_config.yml --load-in-8bit=True",
+    "objectID": "docs/getting-started.html#sec-understanding",
+    "href": "docs/getting-started.html#sec-understanding",
+    "title": "Quickstart",
+    "section": "2 Understanding the Process",
+    "text": "2 Understanding the Process\n\n2.1 The Configuration File\nThe YAML configuration file controls everything about your training. Here’s what (part of) our example config looks like:\nbase_model: NousResearch/Llama-3.2-1B\n\nload_in_8bit: true\nadapter: lora\n\ndatasets:\n  - path: teknium/GPT4-LLM-Cleaned\n    type: alpaca\ndataset_prepared_path: last_run_prepared\nval_set_size: 0.1\noutput_dir: ./outputs/lora-out\n\n\n\n\n\n\nTip\n\n\n\nload_in_8bit: true and adapter: lora enables LoRA adapter finetuning.\n\nTo perform Full finetuning, remove these two lines.\nTo perform QLoRA finetuning, replace with load_in_4bit: true and adapter: qlora.\n\n\n\nSee our Config options for more details.\n\n\n2.2 Training\nWhen you run axolotl train, Axolotl:\n\nDownloads the base model\n(If specified) applies QLoRA/LoRA adapter layers\nLoads and processes the dataset\nRuns the training loop\nSaves the trained model and / or LoRA weights",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/inference.html#sec-merging",
-    "href": "docs/inference.html#sec-merging",
-    "title": "Inference and Merging",
-    "section": "3 Merging LoRA Weights",
-    "text": "3 Merging LoRA Weights\nMerge LoRA adapters with the base model:\naxolotl merge-lora your_config.yml --lora-model-dir=\"./completed-model\"\n\n3.1 Memory Management for Merging\n\nConfiguration OptionsForce CPU Merging\n\n\ngpu_memory_limit: 20GiB  # Adjust based on your GPU\nlora_on_cpu: true        # Process on CPU if needed\n\n\nCUDA_VISIBLE_DEVICES=\"\" axolotl merge-lora ...",
+    "objectID": "docs/getting-started.html#sec-custom",
+    "href": "docs/getting-started.html#sec-custom",
+    "title": "Quickstart",
+    "section": "3 Your First Custom Training",
+    "text": "3 Your First Custom Training\nLet’s modify the example for your own data:\n\nCreate a new config file my_training.yml:\n\nbase_model: NousResearch/Nous-Hermes-llama-1b-v1\n\nload_in_8bit: true\nadapter: lora\n\n# Training settings\nmicro_batch_size: 2\nnum_epochs: 3\nlearning_rate: 0.0003\n\n# Your dataset\ndatasets:\n  - path: my_data.jsonl        # Your local data file\n    type: alpaca               # Or other format\nThis specific config is for LoRA fine-tuning a model with instruction tuning data using\nthe alpaca dataset format, which has the following format:\n{\n    \"instruction\": \"Write a description of alpacas.\",\n    \"input\": \"\",\n    \"output\": \"Alpacas are domesticated South American camelids...\"\n}\nPlease see our Dataset Formats for more dataset formats and how to\nformat them.\n\nPrepare your JSONL data in the specified format (in this case, the expected alpaca\nformat):\n\n{\"instruction\": \"Classify this text\", \"input\": \"I love this!\", \"output\": \"positive\"}\n{\"instruction\": \"Classify this text\", \"input\": \"Not good at all\", \"output\": \"negative\"}\n\nRun the training:\n\naxolotl train my_training.yml",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/inference.html#sec-tokenization",
-    "href": "docs/inference.html#sec-tokenization",
-    "title": "Inference and Merging",
-    "section": "4 Tokenization",
-    "text": "4 Tokenization\n\n4.1 Common Issues\n\n\n\n\n\n\nWarning\n\n\n\nTokenization mismatches between training and inference are a common source of problems.\n\n\nTo debug:\n\nCheck training tokenization:\n\naxolotl preprocess your_config.yml --debug\n\nVerify inference tokenization by decoding tokens before model input\nCompare token IDs between training and inference\n\n\n\n4.2 Special Tokens\nConfigure special tokens in your YAML:\nspecial_tokens:\n  bos_token: \"&lt;s&gt;\"\n  eos_token: \"&lt;/s&gt;\"\n  unk_token: \"&lt;unk&gt;\"\ntokens:\n  - \"&lt;|im_start|&gt;\"\n  - \"&lt;|im_end|&gt;\"",
+    "objectID": "docs/getting-started.html#sec-common-tasks",
+    "href": "docs/getting-started.html#sec-common-tasks",
+    "title": "Quickstart",
+    "section": "4 Common Tasks",
+    "text": "4 Common Tasks\n\n4.1 Testing Your Model\nAfter training, test your model:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\"\n\n\n4.2 Preprocessing Data\nFor large datasets, preprocess first:\naxolotl preprocess my_training.yml\n\n\n4.3 Using a UI\nLaunch a Gradio interface:\naxolotl inference my_training.yml --lora-model-dir=\"./outputs/lora-out\" --gradio",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/inference.html#sec-troubleshooting",
-    "href": "docs/inference.html#sec-troubleshooting",
-    "title": "Inference and Merging",
-    "section": "5 Troubleshooting",
-    "text": "5 Troubleshooting\n\n5.1 Common Problems\n\nMemory IssuesToken IssuesPerformance Issues\n\n\n\nUse 8-bit loading\nReduce batch sizes\nTry CPU offloading\n\n\n\n\nVerify special tokens\nCheck tokenizer settings\nCompare training and inference preprocessing\n\n\n\n\nVerify model loading\nCheck prompt formatting\nEnsure temperature/sampling settings\n\n\n\n\nFor more details, see our debugging guide.",
+    "objectID": "docs/getting-started.html#sec-next-steps",
+    "href": "docs/getting-started.html#sec-next-steps",
+    "title": "Quickstart",
+    "section": "5 Next Steps",
+    "text": "5 Next Steps\nNow that you have the basics, you might want to:\n\nTry different model architectures\nExperiment with hyperparameters\nUse more advanced training methods\nScale up to larger models\n\nCheck our other guides for details on these topics:\n\nConfiguration Guide - Full configuration options\nDataset Formats - Working with different data formats\nMulti-GPU Training\nMulti-Node Training",
     "crumbs": [
       "Getting Started",
-      "Inference and Merging"
+      "Quickstart"
     ]
   },
   {
-    "objectID": "docs/debugging.html",
-    "href": "docs/debugging.html",
-    "title": "Debugging",
+    "objectID": "docs/nccl.html",
+    "href": "docs/nccl.html",
+    "title": "NCCL",
     "section": "",
-    "text": "This document provides some tips and tricks for debugging Axolotl. It also provides an example configuration for debugging with VSCode. A good debugging setup is essential to understanding how Axolotl code works behind the scenes.",
+    "text": "NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several environment variables. A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:\nWatchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.\nOften, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends disabling PCI access control services (ACS) as a possible solution if this is available to you.\nForcing cross-GPU communication via NVLink may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command:\nnvidia-smi nvlink --status\nTo force NCCL to use NVLink, simply set this in the environment:\nexport NCCL_P2P_LEVEL=NVL\nIf NVLink is not available in your environment there are other options for NCCL_P2P_LEVEL in the table below:\n\n\n\n\n\n\n\nNCCL_P2P_LEVEL\nDescription\n\n\n\n\nPIX\nP2P data transfers through no more than a single PCIe bridge. 
Faster data transfer rates vs to paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication.\n\n\nPXB\nP2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency.\n\n\nPHB\nP2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (ex PIX, NVL)\n\n\n\nTo validate that acceptable data transfer speeds exist for your training job, running NCCL Tests can help pinpoint bottlenecks, for example:\n./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3\nIt can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:\nexport NCCL_DEBUG=INFO\nexport NCCL_DEBUG_SUBSYS=ALL\nexport TORCH_DISTRIBUTED_DEBUG=INFO\nexport TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log\nFinally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ddp_timeout value in the Axolotl configuration. See PyTorch init_process_group for documentation on this value.",
     "crumbs": [
       "Troubleshooting",
-      "Debugging"
+      "NCCL"
     ]
   },
   {
-    "objectID": "docs/debugging.html#table-of-contents",
-    "href": "docs/debugging.html#table-of-contents",
-    "title": "Debugging",
-    "section": "Table of Contents",
-    "text": "Table of Contents\n\nGeneral Tips\nDebugging with VSCode\n\nBackground\nConfiguration\nCustomizing your debugger\nVideo Tutorial\n\nDebugging With Docker\n\nSetup\nAttach To Container\nVideo - Attaching To Docker On Remote Host",
+    "objectID": "docs/dataset-formats/tokenized.html",
+    "href": "docs/dataset-formats/tokenized.html",
+    "title": "Custom Pre-Tokenized Dataset",
+    "section": "",
+    "text": "Pass an empty type: in your axolotl config.\nColumns in Dataset must be exactly input_ids, attention_mask, labels\nTo indicate that a token should be ignored during training, set its corresponding label to -100.\nYou must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.\nFor pretraining, do not truncate/pad documents to the context window length.\nFor instruction training, documents must be truncated/padded as desired.\n\nSample config:\n\n\nconfig.yml\n\ndatasets:\n  - path: /path/to/your/file.jsonl\n    ds_type: json\n    type:\n\nSample jsonl:\n{\"input_ids\":[271,299,99],\"attention_mask\":[1,1,1],\"labels\":[271,-100,99]}\n{\"input_ids\":[87,227,8383,12],\"attention_mask\":[1,1,1,1],\"labels\":[87,227,8383,12]}",
     "crumbs": [
-      "Troubleshooting",
-      "Debugging"
+      "Dataset Formats",
+      "Custom Pre-Tokenized Dataset"
     ]
   },
   {
-    "objectID": "docs/debugging.html#general-tips",
-    "href": "docs/debugging.html#general-tips",
-    "title": "Debugging",
-    "section": "General Tips",
-    "text": "General Tips\nWhile debugging it’s helpful to simplify your test scenario as much as possible. Here are some tips for doing so:\n\n[!Important]\nAll of these tips are incorporated into the example configuration for debugging with VSCode below.\n\n\nMake sure you are using the latest version of axolotl: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from main.\nEliminate concurrency: Restrict the number of processes to 1 for both training and data preprocessing:\n\nSet CUDA_VISIBLE_DEVICES to a single GPU, ex: export CUDA_VISIBLE_DEVICES=0.\nSet dataset_processes: 1 in your axolotl config or run the training command with --dataset_processes=1.\n\nUse a small dataset: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure sample_packing: False and eval_sample_packing: False to avoid errors. If you are in a pinch and don’t have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config):\ndatasets:\n    ...\n    shards: 20\nUse a small model: A good example of a small model is TinyLlama/TinyLlama-1.1B-Chat-v1.0.\nMinimize iteration time: Make sure the training loop finishes as fast as possible, with these settings.\n\nmicro_batch_size: 1\nmax_steps: 1\nval_set_size: 0\n\nClear Caches: Axolotl caches certain steps and so does the underlying HuggingFace trainer. You may want to clear some of these caches when debugging.\n\nData preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in dataset_prepared_path: in your axolotl config. 
If you didn’t set this value, the default is last_run_prepared.\nHF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache HuggingFace cache, by deleting the appropriate ~/.cache/huggingface/datasets/... folder(s).\nThe recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run. This is demonstrated in the example configuration below.",
+    "objectID": "docs/dataset-formats/index.html",
+    "href": "docs/dataset-formats/index.html",
+    "title": "Dataset Formats",
+    "section": "",
+    "text": "Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.\nAs there are a lot of available options in Axolotl, this guide aims to simplify the user experience of choosing the proper option.\nAxolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has its own dataset format, which is described below.",
     "crumbs": [
-      "Troubleshooting",
-      "Debugging"
+      "Dataset Formats"
     ]
   },
   {
-    "objectID": "docs/debugging.html#debugging-with-vscode",
-    "href": "docs/debugging.html#debugging-with-vscode",
-    "title": "Debugging",
-    "section": "Debugging with VSCode",
-    "text": "Debugging with VSCode\n\nBackground\nThe below example shows how to configure VSCode to debug data preprocessing of the chat_template format. This is the format used when you have the following in your axolotl config:\ndatasets:\n  - path: &lt;path to your chat_template formatted dataset&gt; # example on HF Hub: fozziethebeat/alpaca_messages_2k_test\n    type: chat_template\n\n[!Important]\nIf you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files .vscode/launch.json and .vscode/tasks.json for an example configuration.\n\n\n[!Tip]\nIf you prefer to watch a video, rather than read, you can skip to the video tutorial below (but doing both is recommended).\n\n\n\nSetup\nMake sure you have an editable install of Axolotl, which ensures that changes you make to the code are reflected at runtime. Run the following commands from the root of this project:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\nRemote Hosts\nIf you developing on a remote host, you can easily use VSCode to debug remotely. To do so, you will need to follow this remote - SSH guide. You can also see the video below on Docker and Remote SSH debugging.\n\n\n\nConfiguration\nThe easiest way to get started is to modify the .vscode/launch.json file in this project. This is just an example configuration, so you may need to modify or copy it to suit your needs.\nFor example, to mimic the command cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml, you would use the below configuration1. Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to devtools and set the env variable HF_HOME to a temporary folder that is later partially deleted. 
This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.\n// .vscode/launch.json\n{\n    \"version\": \"0.2.0\",\n    \"configurations\": [\n        {\n            \"name\": \"Debug axolotl prompt - chat_template\",\n            \"type\": \"python\",\n            \"module\": \"accelerate.commands.launch\",\n            \"request\": \"launch\",\n            \"args\": [\n                \"-m\", \"axolotl.cli.train\", \"dev_chat_template.yml\",\n                // The flags below simplify debugging by overriding the axolotl config\n                // with the debugging tips above.  Modify as needed.\n                \"--dataset_processes=1\",      // limits data preprocessing to one process\n                \"--max_steps=1\",              // limits training to just one step\n                \"--batch_size=1\",             // minimizes batch size\n                \"--micro_batch_size=1\",       // minimizes batch size\n                \"--val_set_size=0\",           // disables validation\n                \"--sample_packing=False\",     // disables sample packing which is necessary for small datasets\n                \"--eval_sample_packing=False\",// disables sample packing on eval set\n                \"--dataset_prepared_path=temp_debug/axolotl_outputs/data\", // send data outputs to a temp folder\n                \"--output_dir=temp_debug/axolotl_outputs/model\" // send model outputs to a temp folder\n                ],\n            \"console\": \"integratedTerminal\",      // show output in the integrated terminal\n            \"cwd\": \"${workspaceFolder}/devtools\", // set working directory to devtools from the root of the project\n            \"justMyCode\": true,                   // step through only axolotl code\n            \"env\": {\"CUDA_VISIBLE_DEVICES\": \"0\",  // Since we aren't doing distributed training, we need to limit to one GPU\n                    
\"HF_HOME\": \"${workspaceFolder}/devtools/temp_debug/.hf-cache\"}, // send HF cache to a temp folder\n            \"preLaunchTask\": \"cleanup-for-dataprep\", // delete temp folders (see below)\n        }\n    ]\n}\nAdditional notes about this configuration:\n\nThe argument justMyCode is set to true such that you step through only the axolotl code. If you want to step into dependencies, set this to false.\nThe preLaunchTask: cleanup-for-dataprep is defined in .vscode/tasks.json and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:\n\n./devtools/temp_debug/axolotl_outputs\n./devtools/temp_debug/.hf-cache/datasets\n\n\n\n[!Tip]\nYou may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the tasks.json file depending on your use case.\n\nBelow is the ./vscode/tasks.json file that defines the cleanup-for-dataprep task. This task is run before each debugging session when you use the above configuration. Note how there are two tasks that delete the two folders mentioned above. The third task cleanup-for-dataprep is a composite task that combines the two tasks. 
A composite task is necessary because VSCode does not allow you to specify multiple tasks in the preLaunchTask argument of the launch.json file.\n// .vscode/tasks.json\n// this file is used by launch.json\n{\n    \"version\": \"2.0.0\",\n    \"tasks\": [\n      // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder\n      {\n        \"label\": \"delete-outputs\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/axolotl_outputs\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n      // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder\n      {\n        \"label\": \"delete-temp-hf-dataset-cache\",\n        \"type\": \"shell\",\n        \"command\": \"rm -rf temp_debug/.hf-cache/datasets\",\n        \"options\":{ \"cwd\": \"${workspaceFolder}/devtools\"},\n        \"problemMatcher\": []\n      },\n        // this task combines the two tasks above\n      {\n       \"label\": \"cleanup-for-dataprep\",\n       \"dependsOn\": [\"delete-outputs\", \"delete-temp-hf-dataset-cache\"],\n      }\n    ]\n}\n\n\nCustomizing your debugger\nYour debugging use case may differ from the example above. The easiest thing to do is to put your own axolotl config in the devtools folder and modify the launch.json file to use your config. You may also want to modify the preLaunchTask to delete different folders or not delete anything at all.\n\n\nVideo Tutorial\nThe following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl w/VSCode",
+    "objectID": "docs/dataset-formats/index.html#pre-training",
+    "href": "docs/dataset-formats/index.html#pre-training",
+    "title": "Dataset Formats",
+    "section": "Pre-training",
+    "text": "Pre-training\nWhen aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports streaming to only load batches into memory at a time.\nA sample format for a pre-training dataset is as follows:\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\nIt is typically recommended to save your dataset as .jsonl due to its flexibility and simplicity.\nAxolotl supports loading from a Hugging Face hub repo or from local files.\n\n\n\n\n\n\nImportant\n\n\n\nFor pre-training only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts.\n\n\n\nPre-training from Hugging Face hub datasets\nAs an example, to train using a Hugging Face dataset hf_org/name, you can pass the following config:\npretraining_dataset: hf_org/name\n\n\nPre-training from local dataset files\nGiven a few corpus files: A.jsonl, B.jsonl, and C.jsonl, your config will look like the below:\npretraining_dataset:\n  - path: json\n    data_files:\n      - A.jsonl\n      - B.jsonl\n      - C.jsonl\nWhile we recommend .jsonl, you can also use the other formats (csv, parquet, arrow, SQL, Webdataset) that are supported by Dataset.load_dataset\n\n\nPre-training without streaming\nOn the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the completion format. 
This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.\nOne benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.\nFrom Hugging Face:\ndatasets:\n  - path: hf_org/name\n    type: completion\nFrom local files (either example works):\ndatasets:\n  - path: A.jsonl\n    type: completion\n\n  - path: json\n    data_files: [\"A.jsonl\", \"B.jsonl\", \"C.jsonl\"]\n    type: completion\n\n\nPre-training dataset configuration tips\n\nSetting max_steps\nWhen using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.\nTherefore, it is necessary to set max_steps: int in your config for pre-training to run, so that Axolotl knows when to stop training.\nOne step is equal to sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus tokens.\n\n\nGroup_by_length\nIt is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.\n\n\n\nReference\nPlease see docs here.",
     "crumbs": [
-      "Troubleshooting",
-      "Debugging"
+      "Dataset Formats"
     ]
   },
   {
-    "objectID": "docs/debugging.html#debugging-with-docker",
-    "href": "docs/debugging.html#debugging-with-docker",
-    "title": "Debugging",
-    "section": "Debugging With Docker",
-    "text": "Debugging With Docker\nUsing official Axolotl Docker images is a great way to debug your code, and is a very popular way to use Axolotl. Attaching VSCode to Docker takes a few more steps.\n\nSetup\nOn the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\n\n[!Tip]\nIf you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.\n\nNext, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:2\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1\n\n[!Tip]\nTo understand which containers are available, see the Docker section of the README and the DockerHub repo. For details of how the Docker containers are built, see axolotl’s Docker CI builds.\n\nYou will now be in the container. Next, perform an editable install of Axolotl:\npip3 install packaging\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAttach To Container\nNext, if you are using a remote host, Remote into this host with VSCode. If you are using a local host, you can skip this step.\nNext, select Dev Containers: Attach to Running Container... using the command palette (CMD + SHIFT + P) in VSCode. You will be prompted to select a container to attach to. Select the container you just created. You will now be in the container with a working directory that is at the root of the project. 
Any changes you make to the code will be reflected both in the container and on the host.\nNow you are ready to debug as described above (see Debugging with VSCode).\n\n\nVideo - Attaching To Docker On Remote Host\nHere is a short video that demonstrates how to attach to a Docker container on a remote host:\n\n\n\nHamel Husain’s tutorial: Debugging Axolotl Part 2: Attaching to Docker on a Remote Host",
+    "objectID": "docs/dataset-formats/index.html#supervised-fine-tuning-sft",
+    "href": "docs/dataset-formats/index.html#supervised-fine-tuning-sft",
+    "title": "Dataset Formats",
+    "section": "Supervised fine-tuning (SFT)",
+    "text": "Supervised fine-tuning (SFT)\nSupervised fine-tuning is the process of training models to respond to an instruction or chat input.\nAs there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.\nAxolotl provides four approaches for loading datasets, however, it’s easier to work backwards from the dataset you have available to figure out which approach to use.\nA flow chart is as follows:\n\nDo you already have the dataset tokenized? If yes, check Pre-Tokenized Dataset.\nDo you want to format the dataset yourself and manually choose each section to mask? If yes, check Template Free Dataset\nIs your dataset in a “conversation” format, containing a list[messages]? If yes, check Conversation Dataset\nIs your dataset in an “instruct” format, containing { instruction, response }? If yes, check Instruction Dataset\n\nIf you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.\n\n\n\n\n\n\nTip\n\n\n\nYou can mix and match within each approach or across approaches to train a model on a variety of datasets.\n\n\n\nPre-Tokenized Dataset\nWe suggest this approach when you want to bring your own tokenized dataset.\nAxolotl expects the dataset to have three keys:\n\ninput_ids: from tokenizing formatted prompt\nattention_mask: for masking padding. 
If you don’t add padding, it would be equal to len(input_ids) * [1]\nlabels: this is the same as input_ids, however, if you want to mask certain tokens, you would set those indices to -100.\n\n\n\n\n\n\n\nTip\n\n\n\nMake sure to add BOS/EOS tokens to your prompt and mask it appropriately.\n\n\nA config for this would look like:\ndatasets:\n  - path: A.jsonl\n    type:\n\n\n\n\n\n\nNote\n\n\n\ntype: is empty!\n\n\nReference: Pre-Tokenized Dataset Documentation.\n\n\nTemplate Free Dataset\nWe reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn’t suffice.\nIn the example below, you could see that there is no proper structure. At the same time, it’s very flexible as there are no constraints on how your prompt can look.\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. 
\"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}\nEach prompt must be have a key called segments which is a list of { text, label }.\ndatasets:\n  - path: A.jsonl\n    type: input_output\nReference: Template Free Documentation.\n\n\nConversation Dataset\nconversation messages are a list of messages which usually contain a role and content key.\n\n\n\n\n\n\nTip\n\n\n\nFun fact: Axolotl synonymously refers to “chat” messages as conversation messages due to how FastChat initially used this term to build a widely used fastchat conversation method for formatting chat messages prior to the creation of chat_templates.\n\n\n\nWhat are chat_templates?\nThe current most popular and convenient method for inference is to use chat_templates for formatting prompts. Axolotl supports using chat_templates for training to ensure that the model performs in the same environment as in inference.\nHere’s a quick rundown on chat_template: A chat_template is a Jinja2 template which formats a list of messages into a prompt.\nAn example of a prompt formatted into a popular template called ChatML can be seen below:\nSingle prompt (pretty-printed):\n{\n    \"messages\": [\n        {\n            \"role\": \"user\",\n            \"content\": \"Hi\"\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": \"How can I help you?\"\n        },\n        {\n            \"role\": \"user\",\n            \"content\": \"Can you add 3+5?\"\n        },\n        {\n            \"role\": \"assistant\",\n            \"content\": \"The answer is 8.\"\n        }\n    ]\n}\nThe ChatML template is as follows:\n{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'&lt;|im_start|&gt;' + message['role'] + '\\n' + message['content'] + '&lt;|im_end|&gt;' 
+ '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '&lt;|im_start|&gt;assistant\\n' }}{% endif %}\nThe above prompt formatted into this template will result in:\n&lt;|im_start|&gt;user\nHi&lt;|im_end|&gt;\n&lt;|im_start|&gt;assistant\nHow can I help you?&lt;|im_end|&gt;\n&lt;|im_start|&gt;user\nCan you add 3+5?&lt;|im_end|&gt;\n&lt;|im_start|&gt;assistant\nThe answer is 8.&lt;|im_end|&gt;\nBy using delimiters (&lt;|im_start|&gt; and &lt;|im_end|&gt;), a prompt separates different speakers which helps the model identify which portion belongs to whom.\n\n\nCommon Conversation Dataset formats\nOlder conversation datasets with the following format are colloquially called sharegpt datasets.\n{\"conversations\": [{\"from\": \"...\", \"value\": \"...\"}]}\nNewer conversation datasets usually follow the OpenAI format.\n{\"messages\": [{\"role\": \"...\", \"content\": \"...\"}]}\nAxolotl supports both as well as allowing customization of any kind of key.\n\n\nChat Template Usage\nTo properly use this method, it is important to identify three things:\n\nWhich chat_template would you use?\nWhat are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be messages, role, and content, respectively, whereas the possible roles are system, user, and assistant.\nWhat do you want to mask? For instance, only assistant messages, only last message, or nothing.\n\n\nChoosing a chat_template\nThere are a lot of chat_templates out there. Axolotl supports the common ones: supported chat templates. For example, to use ChatML, it would be chat_template: chatml.\nHowever, it is also possible to use the already configured template within the tokenizer by specifying chat_template: tokenizer_default. 
If you want a fallback (in case some tokenizer does not have it pre-configured), you can do chat_template: tokenizer_default_fallback_chatml to fallback to the ChatML template if a tokenizer template was not found.\nOne last but powerful approach is to bring your own template. This can be set via:\nchat_template_jinja: # your template\n\n\nSetting chat_template dataset keys\nWe currently default to OpenAI format for dataset keys, so if that’s your current dataset format, there’s nothing to do here.\nIf your dataset format is different, here are the keys you should check (with their defaults):\ndatasets:\n    ...\n    field_messages: messages  # this should point to the key containing the list of conversations\n    message_property_mappings:  # this is a mapping from keys in your dataset to keys in chat_template\n      role: role\n      content: content\nIn some chat_templates (e.g. Gemma), the roles are hardcoded to user and assistant. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a KeyError, it would be necessary to add mapping for your roles. Here is an example of how it would look like:\ndatasets:\n    ...\n    roles:\n      assistant:\n        - gpt\n        - model\n      user:\n        - human\nIn the example above, all gpt and model values are converted to assistant. All human values are converted to user.\n\n\nHandling masking\nThe common use case for chat_template is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.\nTo train on all assistant messages, you would set the following configs.\ndatasets:\n    ...\n    roles_to_train: [\"assistant\"]\n    train_on_eos: \"turn\"\nThe train_on_eos config means that it would mask all EOS tokens for turns that aren’t assistant-turns. 
The other options are: all and last to choose which EOS to train on.\nPerhaps, you want to train on assistant and narrator roles, you can simply add narrator to the list of roles_to_train. You would also need to add it to the mapping of roles above.\ndatasets:\n    ...\n    roles_to_train: [\"assistant\", \"narrator\"]\n    roles:\n      assistant:\n        - gpt\n        - model\n      user:\n        - human\n      narrator: [\"narrator\"]\n\n\n\n\n\n\nTip\n\n\n\nAs chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer’s EOS, it is highly recommended to set them. For example, ChatML uses &lt;|im_end|&gt; to end turns.\nspecial_tokens:\n  eos_token: &lt;|im_end|&gt;\n\n\n\n\nApplying chat_template\nOnce all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.\ndatasets:\n  - path: A.jsonl\n    type: chat_template\n\n    # step 1\n    chat_template: chatml\n\n    # step 2\n    field_messages: messages\n    message_property_mappings:\n      role: role\n      content: content\n\n    roles:\n      assistant:\n        - gpt\n        - model\n        - assistant\n      user:\n        - human\n        - user\n\n    # step 3\n    roles_to_train: [\"assistant\"]\n    train_on_eos: \"turn\"\n\nspecial_tokens:\n  eos_token: &lt;|im_end|&gt;\nIf this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via axolotl preprocess config.yaml --debug):\n&lt;|im_start|&gt;(-100, 128256) user(-100, 882)\n(-100, 198) Hi(-100, 13347) &lt;|im_end|&gt;(-100, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) assistant(-100, 78191)\n(-100, 198) How(4438, 4438)  can(649, 649)  I(358, 358)  help(1520, 1520)  you(499, 499) ?(30, 30) &lt;|im_end|&gt;(128257, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) user(-100, 882)\n(-100, 198) Can(-100, 6854)  you(-100, 499)  add(-100, 923)  (-100, 220) 3(-100, 18) +(-100, 10) 
5(-100, 20) ?(-100, 30) &lt;|im_end|&gt;(-100, 128257)\n(-100, 198) &lt;|im_start|&gt;(-100, 128256) assistant(-100, 78191)\n(-100, 198) The(791, 791)  answer(4320, 4320)  is(374, 374)  (220, 220) 8(23, 23) .(13, 13) &lt;|im_end|&gt;(128257, 128257)\n(-100, 198)\nThe first number refers to the label, the second refers to the token_id. For example, -100 labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the token_id.\n\n\n\n\n\n\nNote\n\n\n\nIf during preprocess, there are a lot of warnings of Could not find content __ boundary, please check the FAQ section for chat_templates.\n\n\n\n\n\nReference\nPlease see docs here.\n\n\n\nInstruction Dataset\nInstruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.\nAn example is of a common format called Alpaca:\n{\"instruction\": \"...\", \"input\": \"...\", \"output\": \"...\"}\nUsing those keys, a prompt can be built based on it.\nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n{output}\nThis can be configured as such:\ndatasets:\n  - path: A.jsonl\n    type: alpaca\nAxolotl supports many kinds of instruction dataset. 
All of them can be found in the Instruction Dataset Documentation with their respective type and sample row format.\n\nCustom Instruct Prompt Format\nDue to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.\nIn the example below, a sample row is used to output in mistral_v1 format.\n{\"input\": \"...\", \"output\": \"...\"}\ndatasets:\n  - path: repo\n    type:\n      system_prompt: \"\"\n\n      field_system:\n      field_instruction: input\n      field_input:\n      field_output: output\n\n      # multi-line example with input\n      format: |-\n        [INST] {instruction} {input} [/INST]\n\n      # single-line example without input\n      no_input_format: \"[INST] {instruction} [/INST]\"\nThe config sets that the field_instruction is actually named input, and the field_input is empty as we don’t have an input in this sample. Generally, instruction can be thought as the question to the model, and input as the additional information with output being the response. It is not necessary to have an input nor system. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.\nReference: Custom Instruct Prompt Format Documentation.",
     "crumbs": [
-      "Troubleshooting",
-      "Debugging"
+      "Dataset Formats"
     ]
   },
   {
-    "objectID": "docs/debugging.html#footnotes",
-    "href": "docs/debugging.html#footnotes",
-    "title": "Debugging",
+    "objectID": "docs/dataset-formats/index.html#reinforcement-learning-from-human-feedback-rlhf",
+    "href": "docs/dataset-formats/index.html#reinforcement-learning-from-human-feedback-rlhf",
+    "title": "Dataset Formats",
+    "section": "Reinforcement Learning from Human Feedback (RLHF)",
+    "text": "Reinforcement Learning from Human Feedback (RLHF)\nAs there are multiple RLHF methods with their own dataset requirements, please see the RLHF documentation for more detail.",
+    "crumbs": [
+      "Dataset Formats"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/template_free.html",
+    "href": "docs/dataset-formats/template_free.html",
+    "title": "Template-Free",
+    "section": "",
+    "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset format\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters &lt;|im_start|&gt; that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n  - path: output.jsonl\n    type: input_output  # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.",
+    "crumbs": [
+      "Dataset Formats",
+      "Template-Free"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/template_free.html#sec-background",
+    "href": "docs/dataset-formats/template_free.html#sec-background",
+    "title": "Template-Free",
+    "section": "",
+    "text": "One of the most popular features of\naxolotl is\nsetting the following configuration value:\ntrain_on_inputs: false\nIf you declare a dataset format\nsuch as alpaca or chatml, axolotl knows what is an input\n(i.e. human) vs. an output (i.e. the assistant) and masks the input\nlabels so that your model can focus on predicting the outputs only.\n\n\n\nHowever, there are many situations where you don’t want to use one of\nthese formats or templates. This is because they can:\n\nAdd unnecessary boilerplate to your prompts.\nCreate artifacts like special delimiters &lt;|im_start|&gt; that can\nquickly become footguns if you don’t include them correctly at\ninference time.\nEnforce a chat interface when you do not want one. Sometimes you\njust want to fine-tune a model to a very specific task and do NOT\nwant multi-turn conversations, roles, etc.\nLimit you to only certain roles that the template allows.\n\n\n\n\nYou can construct your prompts without a template by using the\ninput_output format, by setting type: input_output in your\nconfiguration file like this:\nconfig.yml\ntrain_on_inputs: false # Mask segments of your data\ndatasets:\n  - path: output.jsonl\n    type: input_output  # use template free prompt construction\nUnlike type: completion, which is also template-free,\ntype: input_output allows you to mask segments of your text. More\ndetails on how this works are described below.",
+    "crumbs": [
+      "Dataset Formats",
+      "Template-Free"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/template_free.html#sec-usage",
+    "href": "docs/dataset-formats/template_free.html#sec-usage",
+    "title": "Template-Free",
+    "section": "Usage",
+    "text": "Usage\nThis is how you can use the input_output format:\n\n1. Prepare Data\nTo use the input_output format, collect your data in the following\nformat into a jsonl file (below is the first row from the file\noutput.jsonl` pretty printed):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. \"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}\n\nSet label:false when you want to mask a segment of text so that the\nmodel isn’t trained on it. Some things to keep in mind:\n\n[!IMPORTANT]\n1. EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl\nconcatenates all the segments as-is. The tokenizer doesn’t add\nanything additional. Notice how I added spaces, newlines, &lt;s&gt;\n(BOS), and &lt;/s&gt; (EOS) myself.\n2. Make sure you check the materialized output to validate that the\nprompt is getting assembled how you like.\n\n\n\n2. Use type: input_output\nLet’s materialize data with our output.jsonl file by setting\ntype: input_output in our axolotl config:\n# training_config.yaml\nbase_model: mistralai/Mistral-7B-v0.1\ndata_seed: 49\nseed: 49\n\ndatasets:\n  - path: output.jsonl\n    type: input_output\nval_set_size: 0.1\n\nsequence_len: 896\nsample_packing: false\n\nmicro_batch_size: 2\ngradient_accumulation_steps: 3\neval_batch_size: 2\nnum_epochs: 1\nlearning_rate: 0.0002\n\ntrain_on_inputs: false\nspecial_tokens:\n  bos_token: \"&lt;s&gt;\"\n  eos_token: \"&lt;/s&gt;\"\n  unk_token: \"&lt;unk&gt;\"\nYou can use the following command to materialize your data. 
The\n--debug flag will print the tokens, along with the labels so you can\nverify that the correct items are being ignored:\naxolotl preprocess training_config.yaml --debug\n\n...\n[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] &lt;s&gt;(1, 1) Hello(22557, 22557)\n(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) &lt;/s&gt;(2, 2)\nThe format is decoded_token(label, token_id), for example,\n&lt;s&gt;(1, 1) means that the token is &lt;s&gt;, the label is 1 and the\ntoken_id is 1. When the label is -100 then that token is ignored for\ntraining.\n\n\n3. Check the prompts\nHere is another way to check the materialized output:\nfrom transformers import AutoTokenizer\nfrom datasets import load_from_disk\nimport yaml\n\ndirectory = !ls last_run_prepared/\nwith open('training_config.yaml', 'r') as f:\n    cfg = yaml.safe_load(f)\nmodel_id = cfg['base_model']\ntok = AutoTokenizer.from_pretrained(model_id)\nds = load_from_disk(f'last_run_prepared/{directory[0]}/')\n&gt;&gt;&gt; row = ds[0]\n&gt;&gt;&gt; print(tok.decode(row['input_ids']))\n&lt;s&gt; Hello\n    hi there!.  goodbye  farewell&lt;/s&gt;\nWe can check that the right tokens are ignored by comparing the labels\nto each token:\nimport pandas as pd\npd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in\n              zip(row['input_ids'], row['labels'])])\n\n\n\ntoken\nlabel\nid\n\n\n\n\n0\n&lt;s&gt;\n1\n\n\n1\nHello\n22557\n\n\n2\n\\n\n13\n\n\n3\nhi\n12014\n\n\n4\nthere\n736\n\n\n5\n!\n28808\n\n\n6\n.\n28723\n\n\n7\n\n28705\n\n\n8\ngood\n-100\n\n\n9\nbye\n-100\n\n\n10\n\n-100\n\n\n11\nfare\n19111\n\n\n12\nwell\n5458\n\n\n13\n&lt;/s&gt;\n2\n\n\n\nIf we look at the input data, the above table seems correct! 
(The jsonl\nversion is repeated below for reference):\n$ head -n1 output.jsonl | python -m json.tool\n\n{\n    \"segments\": [\n        {\n            \"label\": true,\n            \"text\": \"&lt;s&gt;Hello\\n\"\n        },\n        {\n            \"label\": true,\n            \"text\": \"hi there!. \"\n        },\n        {\n            \"label\": false,\n            \"text\": \"goodbye \"\n        },\n        {\n            \"label\": true,\n            \"text\": \"farewell&lt;/s&gt;\"\n        }\n    ]\n}",
+    "crumbs": [
+      "Dataset Formats",
+      "Template-Free"
+    ]
+  },
+  {
+    "objectID": "docs/dataset-formats/pretraining.html",
+    "href": "docs/dataset-formats/pretraining.html",
+    "title": "Pre-training",
+    "section": "",
+    "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset:\n  - name:\n    path:\n    split:\n    text_column: # column in dataset with the data, usually `text`\n    type: pretrain\n    trust_remote_code:\n    skip: # number of rows of data to skip over from the beginning",
+    "crumbs": [
+      "Dataset Formats",
+      "Pre-training"
+    ]
+  },
+  {
+    "objectID": "docs/batch_vs_grad.html",
+    "href": "docs/batch_vs_grad.html",
+    "title": "Batch size vs Gradient accumulation",
+    "section": "",
+    "text": "Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn’t significantly impact learning.\nThis method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here’s why:\n\nMemory Consumption with Batch Size: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.\nGradient Accumulation: With gradient accumulation, you’re effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you’re only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. 
As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.\n\nExample 1:\nMicro batch size: 3\nGradient accumulation steps: 2\nNumber of GPUs: 3\nTotal batch size = 3 * 2 * 3 = 18\n| GPU 1          | GPU 2          | GPU 3          |\n|----------------|----------------|----------------|\n| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |\n| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |\n|----------------|----------------|----------------|\n| → (accumulate) | → (accumulate) | → (accumulate) |\n|----------------|----------------|----------------|\n| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |\n| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |\n|----------------|----------------|----------------|\n| → (apply)      | → (apply)      | → (apply)      |\n\nAccumulated gradient for the weight w1 after the second iteration (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18\n\nWeight update for w1:\nw1_new = w1_old - learning rate x (Total gradient for w1 / 18)\nExample 2:\nMicro batch size: 2\nGradient accumulation steps: 1\nNumber of GPUs: 3\nTotal batch size = 2 * 1 * 3 = 6\n| GPU 1     | GPU 2     | GPU 3     |\n|-----------|-----------|-----------|\n| S1, S2    | S3, S4    | S5, S6    |\n| e1, e2    | e3, e4    | e5, e6    |\n|-----------|-----------|-----------|\n| → (apply) | → (apply) | → (apply) |\n\nAccumulated gradient for the weight w1 (considering all GPUs):\nTotal gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6\n\nWeight update for w1:\nw1_new = w1_old - learning rate × (Total gradient for w1 / 6)",
+    "crumbs": [
+      "Core Concepts",
+      "Batch size vs Gradient accumulation"
+    ]
+  },
+  {
+    "objectID": "docs/lr_groups.html",
+    "href": "docs/lr_groups.html",
+    "title": "Learning Rate Groups",
+    "section": "",
+    "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.",
+    "crumbs": [
+      "How To Guides",
+      "Learning Rate Groups"
+    ]
+  },
+  {
+    "objectID": "docs/lr_groups.html#background",
+    "href": "docs/lr_groups.html#background",
+    "title": "Learning Rate Groups",
+    "section": "",
+    "text": "Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of\nmodules in a model.",
+    "crumbs": [
+      "How To Guides",
+      "Learning Rate Groups"
+    ]
+  },
+  {
+    "objectID": "docs/lr_groups.html#example",
+    "href": "docs/lr_groups.html#example",
+    "title": "Learning Rate Groups",
+    "section": "Example",
+    "text": "Example\nlr_groups:\n  - name: o_proj\n    modules:\n      - self_attn.o_proj.weight\n    lr: 1e-6\n  - name: q_proj\n    modules:\n      - model.layers.2.self_attn.q_proj.weight\n    lr: 1e-5\n\nlearning_rate: 2e-5\nIn this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate\nof 1e-6 for all the self attention o_proj modules across all layers, and a learning rate of 1e-5 for the 3rd layer’s\nself attention q_proj module.",
+    "crumbs": [
+      "How To Guides",
+      "Learning Rate Groups"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html",
+    "href": "docs/fsdp_qlora.html",
+    "title": "FSDP + QLoRA",
+    "section": "",
+    "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html#background",
+    "href": "docs/fsdp_qlora.html#background",
+    "title": "FSDP + QLoRA",
+    "section": "",
+    "text": "Using FSDP with QLoRA is essential for fine-tuning larger (70b+ parameter) LLMs on consumer GPUs. For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs1.\nBelow, we describe how to use this feature in Axolotl.",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html#usage",
+    "href": "docs/fsdp_qlora.html#usage",
+    "title": "FSDP + QLoRA",
+    "section": "Usage",
+    "text": "Usage\nTo enable QLoRA with FSDP, you need to perform the following steps:\n\n![Tip]\nSee the example config file in addition to reading these instructions.\n\n\nSet adapter: qlora in your axolotl config file.\nEnable FSDP in your axolotl config, as described here.\nUse one of the supported model types: llama, mistral or mixtral.",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html#example-config",
+    "href": "docs/fsdp_qlora.html#example-config",
+    "title": "FSDP + QLoRA",
+    "section": "Example Config",
+    "text": "Example Config\nexamples/llama-2/qlora-fsdp.yml contains an example of how to enable QLoRA + FSDP in axolotl.",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html#references",
+    "href": "docs/fsdp_qlora.html#references",
+    "title": "FSDP + QLoRA",
+    "section": "References",
+    "text": "References\n\nPR #1378 enabling QLoRA in FSDP in Axolotl.\nBlog Post from the Answer.AI team describing the work that enabled QLoRA in FSDP.\nRelated HuggingFace PRs Enabling FSDP + QLoRA:\n\nAccelerate PR#2544\nTransformers PR#29587\nTRL PR#1416\nPEFT PR#1550",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/fsdp_qlora.html#footnotes",
+    "href": "docs/fsdp_qlora.html#footnotes",
+    "title": "FSDP + QLoRA",
     "section": "Footnotes",
-    "text": "Footnotes\n\n\nThe config actually mimics the command CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml, but this is the same thing.↩︎\nMany of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit. You can read more about these flags here.↩︎",
+    "text": "Footnotes\n\n\nThis was enabled by this work from the Answer.AI team.↩︎",
+    "crumbs": [
+      "Advanced Features",
+      "FSDP + QLoRA"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html",
+    "href": "docs/sequence_parallelism.html",
+    "title": "Sequence Parallelism",
+    "section": "",
+    "text": "Sequence parallelism is a technique that splits sequences across multiple GPUs,\nallowing you to train with very long sequences that wouldn’t fit on a single GPU. Each\nGPU processes a different portion of the sequence, and the results are aggregated\nthrough a ring communication pattern.",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism",
+    "href": "docs/sequence_parallelism.html#when-to-use-sequence-parallelism",
+    "title": "Sequence Parallelism",
+    "section": "When to Use Sequence Parallelism",
+    "text": "When to Use Sequence Parallelism\nUse sequence parallelism when:\n\nYou need to train with sequence lengths that don’t fit into a single GPU’s memory\nYou have multiple GPUs available\nYou’re experiencing OOM (Out Of Memory) errors with long sequences",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#configuration",
+    "href": "docs/sequence_parallelism.html#configuration",
+    "title": "Sequence Parallelism",
+    "section": "Configuration",
+    "text": "Configuration\nTo enable sequence parallelism, add the following to your configuration file:\n# Set to a divisor (&gt; 1) of the number of GPUs available\nsequence_parallel_degree: 4  # Split sequences across 4 GPUs\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\nheads_k_stride: 1\n# Optional; one of \"varlen_llama3\" or \"batch_ring\". Defaults to\n# \"varlen_llama3\" when `sample_packing: true`, and \"batch_ring\" otherwise.\nring_attn_func:\nThe sequence_parallel_degree should be a divisor of the total number of GPUs. For example:\n\nWith 8 GPUs, valid values would be 2, 4, or 8\nWith 4 GPUs, valid values would be 2 or 4",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#implementation-details",
+    "href": "docs/sequence_parallelism.html#implementation-details",
+    "title": "Sequence Parallelism",
+    "section": "Implementation Details",
+    "text": "Implementation Details\nWhen sequence parallelism is enabled:\n\nEach sequence is divided into equal chunks across the GPUs in a sequence parallel group\nThe data collator handles the chunking of input_ids, attention_mask, labels, and position_ids\nPosition IDs are adjusted to maintain proper relative positions, especially for packed sequences\nThe trainer uses special ring communication patterns for attention operations",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#requirements",
+    "href": "docs/sequence_parallelism.html#requirements",
+    "title": "Sequence Parallelism",
+    "section": "Requirements",
+    "text": "Requirements\nTo use sequence parallelism, you need:\n\nMultiple GPUs (at least 2)\nThe ring-flash-attn package. Install with:\n\npip install axolotl[ring-flash-attn] (preferred)\npip install ring-flash-attn&gt;=0.1.4",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#limitations",
+    "href": "docs/sequence_parallelism.html#limitations",
+    "title": "Sequence Parallelism",
+    "section": "Limitations",
+    "text": "Limitations\n\nFlash attention must be enabled for this to work (flash_attention: true in config YAML)\nMay have a small performance overhead due to communication between GPUs",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#example",
+    "href": "docs/sequence_parallelism.html#example",
+    "title": "Sequence Parallelism",
+    "section": "Example",
+    "text": "Example\nbase_model: meta-llama/Llama-3-8B-Instruct\nsequence_len: 8192\n\n...\n\nsequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU\nflash_attention: true  # Required with sequence parallelism\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\nheads_k_stride: 1\n\n...\nThis will train the Llama 3 8B model with 8K context length, with each sequence split\ninto 4 subsequences of length 2048 across 4 GPUs.",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism",
+    "href": "docs/sequence_parallelism.html#sample-packing-with-sequence-parallelism",
+    "title": "Sequence Parallelism",
+    "section": "Sample Packing with Sequence Parallelism",
+    "text": "Sample Packing with Sequence Parallelism\nSequence parallelism is compatible with Axolotl’s sample packing functionality. When using both features together:\n\nSamples are first packed together\nThe packed sequences are then divided across GPUs in the sequence parallel group\nPosition IDs are automatically adjusted to maintain proper relative positions",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/sequence_parallelism.html#effect-on-batch-size",
+    "href": "docs/sequence_parallelism.html#effect-on-batch-size",
+    "title": "Sequence Parallelism",
+    "section": "Effect on Batch Size",
+    "text": "Effect on Batch Size\nWhen using sequence parallelism, your effective global batch size is divided by the sequence_parallel_degree. This happens because:\n\nEach group of sequence_parallel_degree GPUs works on the same batch (just different parts of each sequence)\nThe number of batches processed per step decreases\n\nFor example:\n- With 8 GPUs and no sequence parallelism: 8 different batches processed per step\n- With 8 GPUs and sequence_parallel_degree=4: Only 2 different batches processed per step (each split across 4 GPUs)\n- If your per-GPU micro_batch_size is 2, the global batch size decreases from 16 to 4",
+    "crumbs": [
+      "Advanced Features",
+      "Sequence Parallelism"
+    ]
+  },
+  {
+    "objectID": "docs/amd_hpc.html",
+    "href": "docs/amd_hpc.html",
+    "title": "AMD GPUs on HPC Systems",
+    "section": "",
+    "text": "This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.",
+    "crumbs": [
+      "Deployments",
+      "AMD GPUs on HPC Systems"
+    ]
+  },
+  {
+    "objectID": "docs/amd_hpc.html#setup",
+    "href": "docs/amd_hpc.html#setup",
+    "title": "AMD GPUs on HPC Systems",
+    "section": "Setup",
+    "text": "Setup\n\n1. Install Python\nWe recommend using Miniforge, a minimal conda-based Python distribution:\ncurl -L -O \"https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh\"\nbash Miniforge3-$(uname)-$(uname -m).sh\n\n\n2. Configure Python Environment\nAdd Python to your PATH and ensure it’s available at login:\necho 'export PATH=~/miniforge3/bin:$PATH' &gt;&gt; ~/.bashrc\necho 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' &gt;&gt; ~/.bash_profile\n\n\n3. Load AMD GPU Software\nLoad the ROCm module:\nmodule load rocm/5.7.1\nNote: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.\n\n\n4. Install PyTorch\nInstall PyTorch with ROCm support:\npip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall\n\n\n5. Install Flash Attention\nClone and install the Flash Attention repository:\ngit clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git\nexport GPU_ARCHS=\"gfx90a\"\ncd flash-attention\nexport PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')\npatch \"${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py\" hipify_patch.patch\npip install --no-build-isolation .\n\n\n6. Install Axolotl\nClone and install Axolotl:\ngit clone https://github.com/axolotl-ai-cloud/axolotl\ncd axolotl\npip install packaging ninja\npip install --no-build-isolation -e .\n\n\n7. Apply xformers Workaround\nxformers appears to be incompatible with ROCm. Apply the following workarounds:\n- Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return False for SwiGLU availability from xformers.\n- Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the “SwiGLU” function with a pass statement.\n\n\n8. 
Prepare Job Submission Script\nCreate a script for job submission using your HPC’s particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include\nexport TRANSFORMERS_OFFLINE=1\nexport HF_DATASETS_OFFLINE=1\n\n\n9. Download Base Model\nDownload a base model using the Hugging Face CLI:\nhuggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B\n\n\n10. Create Axolotl Configuration\nCreate an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.\nNote: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.\n\n\n11. Preprocess Data\nRun preprocessing on the login node:\nCUDA_VISIBLE_DEVICES=\"\" python -m axolotl.cli.preprocess /path/to/your/config.yaml\n\n\n12. Train\nYou are now ready to submit your previously prepared job script. 🚂",
+    "crumbs": [
+      "Deployments",
+      "AMD GPUs on HPC Systems"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html",
+    "href": "docs/ray-integration.html",
+    "title": "Ray Train",
+    "section": "",
+    "text": "Axolotl supports using Ray as an alternative to accelerate for orchestrating training. This is especially useful for multi-node training since you only have to set up code and dependencies on a single node and launch training as if you were using a single node.\nWith the --use-ray CLI flag, Axolotl will use Ray Train’s TorchTrainer to run training.",
+    "crumbs": [
+      "Deployments",
+      "Ray Train"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#ray-cluster-setup",
+    "href": "docs/ray-integration.html#ray-cluster-setup",
+    "title": "Ray Train",
+    "section": "Ray cluster setup",
+    "text": "Ray cluster setup\nA prerequisite for using the Ray Train integration is to set up a Ray cluster on your desired node(s). For a detailed guide on how you can get started with Ray clusters, check the official Ray docs here.\nEvery Ray cluster has one head node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and depending on the resources (number of CPUs, GPUs, etc) they request, will be scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer to this doc.",
+    "crumbs": [
+      "Deployments",
+      "Ray Train"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#sanity-check",
+    "href": "docs/ray-integration.html#sanity-check",
+    "title": "Ray Train",
+    "section": "Sanity check",
+    "text": "Sanity check\nTo run a sanity check on whether your ray cluster is setup properly, execute the following on the head node:\nray status\nThe output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:\nNode status\n---------------------------------------------------------------\nActive:\n 1 head\nIdle:\n 2 4xL40S:48CPU-384GB\nPending:\n (no pending nodes)\nRecent failures:\n (no failures)\n\nResources\n---------------------------------------------------------------\nUsage:\n 0.0/96.0 CPU\n 0.0/8.0 GPU\n 0B/800.00GiB memory\n 0B/229.57GiB object_store_memory\n\nDemands:\n (no resource demands)\nYou should also be able to see the same on the Ray dashboard.",
+    "crumbs": [
+      "Deployments",
+      "Ray Train"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#configuring-training-with-ray-train",
+    "href": "docs/ray-integration.html#configuring-training-with-ray-train",
+    "title": "Ray Train",
+    "section": "Configuring training with Ray Train",
+    "text": "Configuring training with Ray Train\nYou can find an example configuration at configs/llama-3/lora-1b-ray.yaml.\nThe key parameters to note here are:\nuse_ray: true\nray_num_workers: 4\n# optional\nresources_per_worker:\n    GPU: 1\n\nuse_ray: This is the flag that enables the Ray Train integration. You can either use the corresponding --use-ray flag in the CLI or set use_ray in the config file.\nray_num_workers: This is the number of workers/GPUs to use for training.\nresources_per_worker: This is the Ray resource request for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do\n\nresources_per_worker:\n    accelerator_type:L40S: 0.001",
+    "crumbs": [
+      "Deployments",
+      "Ray Train"
+    ]
+  },
+  {
+    "objectID": "docs/ray-integration.html#launching-training",
+    "href": "docs/ray-integration.html#launching-training",
+    "title": "Ray Train",
+    "section": "Launching training",
+    "text": "Launching training\nYou can simply run the following command on the head node:\naxolotl train examples/llama-3/lora-1b-ray.yml --use-ray\nThis will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.\nYou can also monitor training progress on the Ray dashboard.\nComing back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let’s say you want to make use of all 8 GPUs. You would be able to just set ray_num_workers: 8 and run the previous command. The Cluster tab will show the following:\n\n\n\nRay dashboard",
+    "crumbs": [
+      "Deployments",
+      "Ray Train"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_preprocessing.html",
+    "href": "docs/dataset_preprocessing.html",
+    "title": "Dataset Preprocessing",
+    "section": "",
+    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.",
+    "crumbs": [
+      "Core Concepts",
+      "Dataset Preprocessing"
+    ]
+  },
+  {
+    "objectID": "docs/dataset_preprocessing.html#overview",
+    "href": "docs/dataset_preprocessing.html#overview",
+    "title": "Dataset Preprocessing",
+    "section": "",
+    "text": "Dataset pre-processing is the step where Axolotl takes each dataset you’ve configured alongside\nthe dataset format and prompt strategies to:\n\nparse the dataset based on the dataset format\ntransform the dataset to how you would interact with the model based on the prompt strategy\ntokenize the dataset based on the configured model & tokenizer\nshuffle and merge multiple datasets together if using more than one\n\nThe processing of the datasets can happen one of two ways:\n\nBefore kicking off training by calling axolotl preprocess config.yaml --debug\nWhen training is started\n\n\n\nWhen training interactively or for sweeps\n(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly\nslow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent\ntraining parameters so that it will intelligently pull from its cache when possible.\nThe path of the cache is controlled by dataset_prepared_path: and is often left blank in example\nYAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.\nIf dataset_prepared_path: is left empty, when training, the processed dataset will be cached in a\ndefault path of ./last_run_prepared/, but will ignore anything already cached there. By explicitly\nsetting dataset_prepared_path: ./last_run_prepared, the trainer will use whatever pre-processed\ndata is in the cache.\n\n\n\nLet’s say you are writing a custom prompt strategy or using a user-defined\nprompt template. Because the trainer cannot readily detect these changes, we cannot change the\ncalculated hash value for the pre-processed dataset.\nIf you have dataset_prepared_path: ... set\nand change your prompt templating logic, it may not pick up the changes you made and you will be\ntraining over the old prompt.",
+    "crumbs": [
+      "Core Concepts",
+      "Dataset Preprocessing"
+    ]
+  },
+  {
+    "objectID": "docs/faq.html",
+    "href": "docs/faq.html",
+    "title": "FAQ",
+    "section": "",
+    "text": "General\nQ: The trainer stopped and hasn’t progressed in several minutes.\n\nA: Usually an issue with the GPUs communicating with each other. See the NCCL doc\n\nQ: Exitcode -9\n\nA: This usually happens when you run out of system RAM.\n\nQ: Exitcode -7 while using deepspeed\n\nA: Try upgrading deepspeed w: pip install -U deepspeed\n\nQ: AttributeError: ‘DummyOptim’ object has no attribute ‘step’\nQ: ModuleNotFoundError: No module named ‘mpi4py’ using single GPU with deepspeed\n\nA: You may be using deepspeed with single gpu. Please remove the deepspeed: section in the yaml file or --deepspeed CLI flag.\n\nQ: The codes is stuck on saving preprocessed datasets.\n\nA: This is usually an issue with the GPU. This can be resolved through setting the os environment variable CUDA_VISIBLE_DEVICES=0. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.\n\nQ: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.\n\nA: This is likely due to vocab size mismatch. By default, Axolotl expands the model’s embeddings if the tokenizer has more tokens than the model. Please use the axolotl merge-lora command to merge the adapters instead of using your own scripts.\n\n\nOn the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model’s embeddings unless shrink_embeddings: true is set in the config.\n\nQ: How to call Axolotl via custom python scripts?\n\nA: Since Axolotl is just Python, please see src/axolotl/cli/main.py on how each command is called.\n\nQ: How to know the value to use for fsdp_transformer_layer_cls_to_wrap?\n\nA: This is the class name of the transformer layer to wrap with FSDP. For example, for LlamaForCausalLM, the value is LlamaDecoderLayer. 
To find this for a specific model, check the model’s PreTrainedModel definition and look for _no_split_modules variable in the modeling_&lt;model_name&gt;.py file within transformers library.\n\nQ: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token\n\nA: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:\n\n\nspecial_tokens:\n  # str. If you're not sure, set to same as `eos_token`.\n  pad_token: \"...\"\n\n\n\nChat templates\nQ: jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____\n\nA: This means that the property mapping for the stated attribute does not exist when building chat_template prompt. For example, if no attribute 'content', please check you have added the correct mapping for content under message_property_mappings.\n\nQ: Empty template generated for turn ___\n\nA: The content is empty for that turn.\n\nQ: Could not find content start/end boundary for turn __\n\nA: The specific turn’s start/end could not be detected. Please ensure you have set the eos_token following your chat_template. Otherwise, this could be a chat_template which doesn’t use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not [[dummy_message]]. Please let us know about this.\n\nQ: Content end boundary is before start boundary for turn ___\n\nA: This is an edge case which should not occur. Please create an Issue if this happens.\n\nQ: Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.\n\nA: This is likely an empty turn.\n\nQ: The EOS token is incorrectly being masked or not being masked / EOS token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThis is because of the mismatch between tokenizer.eos_token and EOS token in template. 
Please make sure to set eos_token: under special_tokens: to the same EOS token as in template.\n\n\n\n\nThe EOS token is not in the template. Please check if your template is correct. As an example, phi_35 template does not use its dedicated EOS token &lt;|endoftext|&gt; at the end.\n\n\nQ: “chat_template choice is tokenizer_default but tokenizer’s chat_template is null. Please add a chat_template in tokenizer config”\n\nA: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See chat_template for more details.\n\nQ: The EOT token(s) are incorrectly being masked or not being masked / EOT token __ not found in chat template.\n\nA: There can be two reasons:\n\n\n\nThe EOT token is different from the EOS token and was not specified under eot_tokens:. Please set eot_tokens: to the same EOT token(s) as in template.\n\n\n\n\nThere is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.\n\n\nQ: EOT token encoding failed. Please check if the token is valid and can be encoded.\n\nA: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.\n\nQ: EOT token __ is encoded as multiple tokens.\n\nA: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under tokens: or (recommended) override unused added_tokens via added_tokens_overrides:.\n\nQ: Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot\n\nA: This is because the EOS token is in the eot_tokens: while mismatch between train_on_eos: and train_on_eot:. This will cause one to override the other. 
Please ensure that train_on_eos: and train_on_eot: are the same or remove the EOS token from eot_tokens:.\n\nQ: If eot_tokens: is not provided, what happens?\n\nA: If eot_tokens: is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.\n\n\nInternally, eot_tokens: tokenizer.eos_token and train_on_eot: train_on_eos (which defaults to turn). This transition helps clarify the naming and behavior of EOT/EOS tokens.",
     "crumbs": [
       "Troubleshooting",
-      "Debugging"
+      "FAQ"
     ]
   },
   {
-    "objectID": "docs/multipack.html",
-    "href": "docs/multipack.html",
-    "title": "Multipack (Sample Packing)",
-    "section": "",
-    "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d attention mask. We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B ]\n   C C C C C C C ]\n   D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B X X X X X X ]\n   C C C C C C C X X X X ]\n   D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n   B C C C C C C C D D D D E E E E\n   E E E E F F F F F G G G H H H H\n   I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]",
-    "crumbs": [
-      "Core Concepts",
-      "Multipack (Sample Packing)"
-    ]
-  },
-  {
-    "objectID": "docs/multipack.html#visualization-of-multipack-with-flash-attention",
-    "href": "docs/multipack.html#visualization-of-multipack-with-flash-attention",
-    "title": "Multipack (Sample Packing)",
-    "section": "",
-    "text": "Because Flash Attention simply drops the attention mask, we do not need to\nconstruct a 4d attention mask. We only need to concatenate the sequences into\na single batch and let flash attention know where each new sequence begins.\n4k context, bsz =4,\neach character represents 256 tokens\nX represents a padding token\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B ]\n   C C C C C C C ]\n   D D D D ]]\n\n[[ E E E E E E E E ]\n [ F F F F ]\n [ G G G ]\n [ H H H H ]]\n\n[[ I I I ]\n [ J J J ]\n [ K K K K K]\n [ L L L ]]\nafter padding to longest input in each step\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A ]\n   B B B B B B X X X X X X ]\n   C C C C C C C X X X X ]\n   D D D D X X X X X X X ]]\n\n[[ E E E E E E E E ]\n [ F F F F X X X X ]\n [ G G G X X X X X ]\n [ H H H H X X X X ]]\n\n[[ I I I X X ]\n [ J J J X X ]\n [ K K K K K ]\n [ L L L X X ]]\nw packing ( note it’s the same effective number of tokens per step, but a true bsz of 1)\n   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5\n[[ A A A A A A A A A A A B B B B B\n   B C C C C C C C D D D D E E E E\n   E E E E F F F F F G G G H H H H\n   I I I J J J J K K K K K L L L X ]]\ncu_seqlens:\n[[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]",
-    "crumbs": [
-      "Core Concepts",
-      "Multipack (Sample Packing)"
-    ]
-  },
-  {
-    "objectID": "docs/multipack.html#multipack-without-flash-attention",
-    "href": "docs/multipack.html#multipack-without-flash-attention",
-    "title": "Multipack (Sample Packing)",
-    "section": "Multipack without Flash Attention",
-    "text": "Multipack without Flash Attention\nMultipack can still be achieved without Flash attention, but with lower packing\nefficiency as we are not able to join multiple batches into a single batch due to\ncontext length limits without flash attention. We can use either Pytorch’s Scaled\nDot Product Attention implementation or native Pytorch attention implementation\nalong with 4d attention masks\nto pack sequences together and avoid cross attention.",
-    "crumbs": [
-      "Core Concepts",
-      "Multipack (Sample Packing)"
-    ]
-  },
-  {
-    "objectID": "docs/docker.html",
-    "href": "docs/docker.html",
-    "title": "Docker",
-    "section": "",
-    "text": "This section describes the different Docker images that are released by AxolotlAI at Docker Hub.",
-    "crumbs": [
-      "Deployments",
-      "Docker"
-    ]
-  },
-  {
-    "objectID": "docs/docker.html#base",
-    "href": "docs/docker.html#base",
-    "title": "Docker",
-    "section": "Base",
-    "text": "Base\nThe base image is the most minimal image that can install Axolotl. It is based on the nvidia/cuda image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.\n\nImage\naxolotlai/axolotl-base\nLink: Docker Hub\n\n\nTags format\nmain-base-py{python_version}-cu{cuda_version}-{pytorch_version}\nTags examples:\n\nmain-base-py3.11-cu128-2.7.0\nmain-base-py3.11-cu126-2.7.0\nmain-base-py3.11-cu124-2.6.0\nmain-base-py3.11-cu124-2.5.1\nmain-base-py3.11-cu124-2.4.1",
-    "crumbs": [
-      "Deployments",
-      "Docker"
-    ]
-  },
-  {
-    "objectID": "docs/docker.html#main",
-    "href": "docs/docker.html#main",
-    "title": "Docker",
-    "section": "Main",
-    "text": "Main\nThe main image is the image that is used to run Axolotl. It is based on the axolotlai/axolotl-base image and includes the Axolotl codebase, dependencies, and more.\n\nImage\naxolotlai/axolotl\nLink: Docker Hub\n\n\nTags format\n# on push to main\nmain-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)\nmain-latest\n\n# nightly build\n{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}\n\n# tagged release\n{version}\n\n\n\n\n\n\nTip\n\n\n\nThere may be some extra tags appended to the image, like -vllm which installs those packages.\n\n\nTags examples:\n\nmain-py3.11-cu126-2.7.0\nmain-py3.11-cu124-2.6.0\nmain-py3.11-cu124-2.5.1\nmain-py3.11-cu124-2.4.1\nmain-latest\nmain-20250303-py3.11-cu124-2.6.0\nmain-20250303-py3.11-cu124-2.5.1\nmain-20250303-py3.11-cu124-2.4.1\n0.7.1",
-    "crumbs": [
-      "Deployments",
-      "Docker"
-    ]
-  },
-  {
-    "objectID": "docs/docker.html#cloud",
-    "href": "docs/docker.html#cloud",
-    "title": "Docker",
-    "section": "Cloud",
-    "text": "Cloud\nThe cloud image is the image that is used to run Axolotl in the cloud. It is based on the axolotlai/axolotl image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.\n\n\n\n\n\n\nTip\n\n\n\nJupyter lab is run by default. Set JUPYTER_DISABLE=1 in the environment variables to disable it.\n\n\n\nImage\naxolotlai/axolotl-cloud\nLink: Docker Hub\n\n\nTags format\nThis uses the same tags as the main image.\n\n\nEnvironment variables\n\nJUPYTER_DISABLE: Disable Jupyter lab.\nJUPYTER_PASSWORD: Set a password for the Jupyter lab.\nPUBLIC_KEY / SSH_KEY: Add a public key for the SSH service.\n\n\n\nVolume mounts\n\n\n\n\n\n\nTip\n\n\n\nWe recommend mounting volumes to /workspace/data for data persistence. /workspace/axolotl contains the source code and is ephemeral.\n\n\n\n/workspace/data/axolotl-artifacts: Directory to store Axolotl artifacts.\n/workspace/data/huggingface-cache: Directory to store HuggingFace cache.",
-    "crumbs": [
-      "Deployments",
-      "Docker"
-    ]
-  },
-  {
-    "objectID": "docs/docker.html#cloud-no-tmux",
-    "href": "docs/docker.html#cloud-no-tmux",
-    "title": "Docker",
-    "section": "Cloud-no-tmux",
-    "text": "Cloud-no-tmux\nThis is the same as the cloud image but without tmux.\n\nImage\naxolotlai/axolotl-cloud-term\nLink: Docker Hub\n\n\n\n\n\n\nNote\n\n\n\nThe naming may be a bit confusing as it has -term appended to the end.\n\n\n\n\nTags format\nThis uses the same tags as the cloud image.",
-    "crumbs": [
-      "Deployments",
-      "Docker"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html",
-    "href": "docs/installation.html",
-    "title": "Installation",
-    "section": "",
-    "text": "This guide covers all the ways you can install and set up Axolotl for your environment.",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-requirements",
-    "href": "docs/installation.html#sec-requirements",
-    "title": "Installation",
-    "section": "1 Requirements",
-    "text": "1 Requirements\n\nNVIDIA GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU\nPython ≥3.10\nPyTorch ≥2.4.1",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-installation-methods",
-    "href": "docs/installation.html#sec-installation-methods",
-    "title": "Installation",
-    "section": "2 Installation Methods",
-    "text": "2 Installation Methods\n\n\n\n\n\n\nImportant\n\n\n\nPlease make sure to have Pytorch installed before installing Axolotl in your local environment.\nFollow the instructions at: https://pytorch.org/get-started/locally/\n\n\n\n2.1 PyPI Installation (Recommended)\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\nWe use --no-build-isolation in order to detect the installed PyTorch version (if\ninstalled) in order not to clobber it, and so that we set the correct version of\ndependencies that are specific to the PyTorch version or other installed\nco-dependencies.\n\n\n2.2 Edge/Development Build\nFor the latest features between releases:\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\n2.3 Docker\ndocker run --gpus '\"all\"' --rm -it axolotlai/axolotl:main-latest\nFor development with Docker:\ndocker compose up -d\n\n\n\n\n\n\nAdvanced Docker Configuration\n\n\n\ndocker run --privileged --gpus '\"all\"' --shm-size 10g --rm -it \\\n  --name axolotl --ipc=host \\\n  --ulimit memlock=-1 --ulimit stack=67108864 \\\n  --mount type=bind,src=\"${PWD}\",target=/workspace/axolotl \\\n  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \\\n  axolotlai/axolotl:main-latest\n\n\nPlease refer to the Docker documentation for more information on the different Docker images that are available.",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-cloud",
-    "href": "docs/installation.html#sec-cloud",
-    "title": "Installation",
-    "section": "3 Cloud Environments",
-    "text": "3 Cloud Environments\n\n3.1 Cloud GPU Providers\nFor providers supporting Docker:\n\nUse axolotlai/axolotl-cloud:main-latest\nAvailable on:\n\nLatitude.sh\nJarvisLabs.ai\nRunPod\nNovita\n\n\n\n\n3.2 Google Colab\nUse our example notebook.",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-platform-specific",
-    "href": "docs/installation.html#sec-platform-specific",
-    "title": "Installation",
-    "section": "4 Platform-Specific Instructions",
-    "text": "4 Platform-Specific Instructions\n\n4.1 macOS\npip3 install --no-build-isolation -e '.'\nSee Section 6 for Mac-specific issues.\n\n\n4.2 Windows\n\n\n\n\n\n\nImportant\n\n\n\nWe recommend using WSL2 (Windows Subsystem for Linux) or Docker.",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-env-managers",
-    "href": "docs/installation.html#sec-env-managers",
-    "title": "Installation",
-    "section": "5 Environment Managers",
-    "text": "5 Environment Managers\n\n5.1 Conda/Pip venv\n\nInstall Python ≥3.10\nInstall PyTorch: https://pytorch.org/get-started/locally/\nInstall Axolotl:\npip3 install -U packaging setuptools wheel ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n(Optional) Login to Hugging Face:\nhuggingface-cli login",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/installation.html#sec-troubleshooting",
-    "href": "docs/installation.html#sec-troubleshooting",
-    "title": "Installation",
-    "section": "6 Troubleshooting",
-    "text": "6 Troubleshooting\nIf you encounter installation issues, see our FAQ and Debugging Guide.",
-    "crumbs": [
-      "Getting Started",
-      "Installation"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html",
-    "href": "docs/multi-gpu.html",
-    "title": "Multi-GPU",
-    "section": "",
-    "text": "This guide covers advanced training configurations for multi-GPU setups using Axolotl.",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-overview",
-    "href": "docs/multi-gpu.html#sec-overview",
-    "title": "Multi-GPU",
-    "section": "1 Overview",
-    "text": "1 Overview\nAxolotl supports several methods for multi-GPU training:\n\nDeepSpeed (recommended)\nFSDP (Fully Sharded Data Parallel)\nSequence parallelism\nFSDP + QLoRA",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-deepspeed",
-    "href": "docs/multi-gpu.html#sec-deepspeed",
-    "title": "Multi-GPU",
-    "section": "2 DeepSpeed",
-    "text": "2 DeepSpeed\nDeepSpeed is the recommended approach for multi-GPU training due to its stability and performance. It provides various optimization levels through ZeRO stages.\n\n2.1 Configuration\nAdd to your YAML config:\ndeepspeed: deepspeed_configs/zero1.json\n\n\n2.2 Usage\n# Fetch deepspeed configs (if not already present)\naxolotl fetch deepspeed_configs\n\n# Passing arg via config\naxolotl train config.yml\n\n# Passing arg via cli\naxolotl train config.yml --deepspeed deepspeed_configs/zero1.json\n\n\n2.3 ZeRO Stages\nWe provide default configurations for:\n\nZeRO Stage 1 (zero1.json)\nZeRO Stage 1 with torch compile (zero1_torch_compile.json)\nZeRO Stage 2 (zero2.json)\nZeRO Stage 3 (zero3.json)\nZeRO Stage 3 with bf16 (zero3_bf16.json)\nZeRO Stage 3 with bf16 and CPU offload params(zero3_bf16_cpuoffload_params.json)\nZeRO Stage 3 with bf16 and CPU offload params and optimizer (zero3_bf16_cpuoffload_all.json)\n\n\n\n\n\n\n\nTip\n\n\n\nChoose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.\nStart from Stage 1 -&gt; Stage 2 -&gt; Stage 3.",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-fsdp",
-    "href": "docs/multi-gpu.html#sec-fsdp",
-    "title": "Multi-GPU",
-    "section": "3 FSDP",
-    "text": "3 FSDP\n\n3.1 Basic FSDP Configuration\nfsdp:\n  - full_shard\n  - auto_wrap\nfsdp_config:\n  fsdp_offload_params: true\n  fsdp_state_dict_type: FULL_STATE_DICT\n  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-sequence-parallelism",
-    "href": "docs/multi-gpu.html#sec-sequence-parallelism",
-    "title": "Multi-GPU",
-    "section": "4 Sequence parallelism",
-    "text": "4 Sequence parallelism\nWe support sequence parallelism (SP) via the\nring-flash-attention project. This\nallows one to split up sequences across GPUs, which is useful in the event that a\nsingle sequence causes OOM errors during model training.\nFirst, install ring-flash-attn, recommended via pip install axolotl[ring-flash-attn],\nor from source with pip install .[ring-flash-attn].\nYour Axolotl YAML config should contain the following lines:\nsequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU\nflash_attention: true  # Required with sequence parallelism\n\n# Optional; strides across the key dimension. Larger values use more memory but will make training faster.\nheads_k_stride: 1\nSee our dedicated guide for more details.\n\n4.1 FSDP + QLoRA\nFor combining FSDP with QLoRA, see our dedicated guide.",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-performance",
-    "href": "docs/multi-gpu.html#sec-performance",
-    "title": "Multi-GPU",
-    "section": "5 Performance Optimization",
-    "text": "5 Performance Optimization\n\n5.1 Liger Kernel Integration\nPlease see docs for more info.",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/multi-gpu.html#sec-troubleshooting",
-    "href": "docs/multi-gpu.html#sec-troubleshooting",
-    "title": "Multi-GPU",
-    "section": "6 Troubleshooting",
-    "text": "6 Troubleshooting\n\n6.1 NCCL Issues\nFor NCCL-related problems, see our NCCL troubleshooting guide.\n\n\n6.2 Common Problems\n\nMemory IssuesTraining Instability\n\n\n\nReduce micro_batch_size\nReduce eval_batch_size\nAdjust gradient_accumulation_steps\nConsider using a higher ZeRO stage\n\n\n\n\nStart with DeepSpeed ZeRO-2\nMonitor loss values\nCheck learning rates\n\n\n\n\nFor more detailed troubleshooting, see our debugging guide.",
-    "crumbs": [
-      "Deployments",
-      "Multi-GPU"
-    ]
-  },
-  {
-    "objectID": "docs/dataset_loading.html",
-    "href": "docs/dataset_loading.html",
-    "title": "Dataset Loading",
-    "section": "",
-    "text": "Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.",
-    "crumbs": [
-      "How To Guides",
-      "Dataset Loading"
-    ]
-  },
-  {
-    "objectID": "docs/dataset_loading.html#overview",
-    "href": "docs/dataset_loading.html#overview",
-    "title": "Dataset Loading",
-    "section": "",
-    "text": "Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.",
-    "crumbs": [
-      "How To Guides",
-      "Dataset Loading"
-    ]
-  },
-  {
-    "objectID": "docs/dataset_loading.html#loading-datasets",
-    "href": "docs/dataset_loading.html#loading-datasets",
-    "title": "Dataset Loading",
-    "section": "Loading Datasets",
-    "text": "Loading Datasets\nWe use the datasets library to load datasets and a mix of load_dataset and load_from_disk to load them.\nYou may recognize the similar named configs between load_dataset and the datasets section of the config file.\ndatasets:\n  - path:\n    name:\n    data_files:\n    split:\n    revision:\n    trust_remote_code:\n\n\n\n\n\n\nTip\n\n\n\nDo not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be path and sometimes data_files.\n\n\nThis matches the API of datasets.load_dataset, so if you’re familiar with that, you will feel right at home.\nFor HuggingFace’s guide to load different dataset types, see here.\nFor full details on the config, see config.qmd.\n\n\n\n\n\n\nNote\n\n\n\nYou can set multiple datasets in the config file by more than one entry under datasets.\ndatasets:\n  - path: /path/to/your/dataset\n  - path: /path/to/your/other/dataset\n\n\n\nLocal dataset\n\nFiles\nUsually, to load a JSON file, you would do something like this:\nfrom datasets import load_dataset\n\ndataset = load_dataset(\"json\", data_files=\"data.json\")\nWhich translates to the following config:\ndatasets:\n  - path: json\n    data_files: /path/to/your/file.jsonl\nHowever, to make things easier, we have added a few shortcuts for loading local dataset files.\nYou can just point the path to the file or directory along with the ds_type to load the dataset. 
The below example shows for a JSON file:\ndatasets:\n  - path: /path/to/your/file.jsonl\n    ds_type: json\nThis works for CSV, JSON, Parquet, and Arrow files.\n\n\n\n\n\n\nTip\n\n\n\nIf path points to a file and ds_type is not specified, we will automatically infer the dataset type from the file extension, so you could omit ds_type if you’d like.\n\n\n\n\nDirectory\nIf you’re loading a directory, you can point the path to the directory.\nThen, you have two options:\n\nLoading entire directory\nYou do not need any additional configs.\nWe will attempt to load in the following order:\n- datasets saved with datasets.save_to_disk\n- loading entire directory of files (such as with parquet/arrow files)\ndatasets:\n  - path: /path/to/your/directory\n\n\nLoading specific files in directory\nProvide data_files with a list of files to load.\ndatasets:\n    # single file\n  - path: /path/to/your/directory\n    ds_type: csv\n    data_files: file1.csv\n\n    # multiple files\n  - path: /path/to/your/directory\n    ds_type: json\n    data_files:\n      - file1.jsonl\n      - file2.jsonl\n\n    # multiple files for parquet\n  - path: /path/to/your/directory\n    ds_type: parquet\n    data_files:\n      - file1.parquet\n      - file2.parquet\n\n\n\n\nHuggingFace Hub\nThe method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.\n\n\n\n\n\n\nNote\n\n\n\nIf you’re using a private dataset, you will need to enable the hf_use_auth_token flag in the root-level of the config file.\n\n\n\nFolder uploaded\nThis would mean that the dataset is a single file or file(s) uploaded to the Hub.\ndatasets:\n  - path: org/dataset-name\n    data_files:\n      - file1.jsonl\n      - file2.jsonl\n\n\nHuggingFace Dataset\nThis means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via datasets.push_to_hub.\ndatasets:\n  - path: org/dataset-name\n\n\n\n\n\n\nNote\n\n\n\nThere are some 
other configs which may be required like name, split, revision, trust_remote_code, etc depending on the dataset.\n\n\n\n\n\nRemote Filesystems\nVia the storage_options config under load_dataset, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.\n\n\n\n\n\n\nWarning\n\n\n\nThis is currently experimental. Please let us know if you run into any issues!\n\n\nThe only difference between the providers is that you need to prepend the path with the respective protocols.\ndatasets:\n    # Single file\n  - path: s3://bucket-name/path/to/your/file.jsonl\n\n    # Directory\n  - path: s3://bucket-name/path/to/your/directory\nFor directory, we load via load_from_disk.\n\nS3\nPrepend the path with s3://.\nThe credentials are pulled in the following order:\n\nAWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_SESSION_TOKEN environment variables\nfrom the ~/.aws/credentials file\nfor nodes on EC2, the IAM metadata provider\n\n\n\n\n\n\n\nNote\n\n\n\nWe assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! 
We may have to open a config option for this.\n\n\nOther environment variables that can be set can be found in boto3 docs\n\n\nGCS\nPrepend the path with gs:// or gcs://.\nThe credentials are loaded in the following order:\n\ngcloud credentials\nfor nodes on GCP, the google metadata service\nanonymous access\n\n\n\nAzure\n\nGen 1\nPrepend the path with adl://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_TENANT_ID\nAZURE_STORAGE_CLIENT_ID\nAZURE_STORAGE_CLIENT_SECRET\n\n\n\nGen 2\nPrepend the path with abfs:// or az://.\nEnsure you have the following environment variables set:\n\nAZURE_STORAGE_ACCOUNT_NAME\nAZURE_STORAGE_ACCOUNT_KEY\n\nOther environment variables that can be set can be found in adlfs docs\n\n\n\nOCI\nPrepend the path with oci://.\nIt would attempt to read in the following order:\n\nOCIFS_IAM_TYPE, OCIFS_CONFIG_LOCATION, and OCIFS_CONFIG_PROFILE environment variables\nwhen on OCI resource, resource principal\n\nOther environment variables:\n\nOCI_REGION_METADATA\n\nPlease see the ocifs docs.\n\n\n\nHTTPS\nThe path should start with https://.\ndatasets:\n  - path: https://path/to/your/dataset/file.jsonl\nThis must be publically accessible.",
-    "crumbs": [
-      "How To Guides",
-      "Dataset Loading"
-    ]
-  },
-  {
-    "objectID": "docs/dataset_loading.html#next-steps",
-    "href": "docs/dataset_loading.html#next-steps",
-    "title": "Dataset Loading",
-    "section": "Next steps",
-    "text": "Next steps\nNow that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format dataset formats docs.",
-    "crumbs": [
-      "How To Guides",
-      "Dataset Loading"
-    ]
-  },
-  {
-    "objectID": "docs/config.html",
-    "href": "docs/config.html",
-    "title": "Config Reference",
-    "section": "",
-    "text": "# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files\n# This can also be a relative path to a model on disk\nbase_model: ./llama-7b-hf\n# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)\nbase_model_ignore_patterns:\n# If the base_model repo on hf hub doesn't include configuration .json files,\n# You can set that here, or leave this empty to default to base_model\nbase_model_config: ./llama-7b-hf\n# You can specify to choose a specific model revision from huggingface hub\nrevision_of_model:\n# Optional tokenizer configuration path in case you want to use a different tokenizer\n# than the one defined in the base model\ntokenizer_config:\n# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too\nmodel_type: AutoModelForCausalLM\n# Corresponding tokenizer for the model AutoTokenizer is a good choice\ntokenizer_type: AutoTokenizer\n# Trust remote code for untrusted source\ntrust_remote_code:\n# use_fast option for tokenizer loading from_pretrained, default to True\ntokenizer_use_fast:\n# Whether to use the legacy tokenizer setting, defaults to True\ntokenizer_legacy:\n# Resize the model embeddings when new tokens are added to multiples of 32\n# This is reported to improve training speed on some models\nresize_token_embeddings_to_32x:\n# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.\nshrink_embeddings:\n# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs\nembeddings_skip_upcast:\n# Whether to load the model with randomly initialized weights. 
Useful for\n# pre-training a model from scratch or debugging purposes.\nrandom_init_weights:\n\n# (Internal use only)\n# Used to identify which the model is based on\nis_falcon_derived_model:\nis_llama_derived_model:\nis_qwen_derived_model:\n# Please note that if you set this to true, `padding_side` will be set to \"left\" by default\nis_mistral_derived_model:\n\n# optional overrides to the base model configuration\noverrides_of_model_config:\n  # RoPE Scaling https://github.com/huggingface/transformers/pull/24653\n  rope_scaling:\n    type: # linear | dynamic\n    factor: # float\n\n# optional overrides the base model loading from_pretrained\noverrides_of_model_kwargs:\n  # use_cache: False\n\n# optional overrides to the bnb 4bit quantization configuration\n# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig\nbnb_config_kwargs:\n  # These are default values\n  llm_int8_has_fp16_weight: false\n  bnb_4bit_quant_type: nf4\n  bnb_4bit_use_double_quant: true\n\n\n# Whether you are training a 4-bit GPTQ quantized model\ngptq: true\n\n# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer\nload_in_8bit: true\n# Use bitsandbytes 4 bit\nload_in_4bit:\n\n# Use CUDA bf16\nbf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require &gt;=ampere\n# Use CUDA fp16\nfp16: true\n# Use CUDA tf32\ntf32: true # require &gt;=ampere\n# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explict fp16 setting\n\n# No AMP (automatic mixed precision)\nbfloat16: true # require &gt;=ampere\nfloat16: true\n\n# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset\ngpu_memory_limit: 20GiB\n# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge\nlora_on_cpu: true\n\n# List[str]. 
Add plugins to extend the pipeline.\n# See `src/axolotl/integrations` for the available plugins or doc below for more details.\n# https://docs.axolotl.ai/docs/custom_integrations.html\nplugins:\n  # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n\n# A list of one or more datasets to finetune the model with\ndatasets:\n  # HuggingFace dataset repo | s3://,gs:// path | \"json\" for local dataset, make sure to fill data_files\n  - path: vicgalle/alpaca-gpt4\n    # The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]\n    type: alpaca # format | format:&lt;prompt_style&gt; (chat/instruct) | &lt;prompt_strategies&gt;.load_&lt;load_fn&gt;\n    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file\n    data_files: # Optional[str] path to source data files\n\n    shards: # Optional[int] split dataset into N pieces (use with shards_idx)\n    shards_idx: # Optional[int] = 0 the index of sharded dataset to use\n\n    preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)\n\n    name: # Optional[str] name of dataset configuration to load\n    split: train # Optional[str] name of dataset split to load from\n    revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.\n    trust_remote_code: # Optional[bool] Trust remote code for untrusted source\n\n  # Custom user instruction prompt\n  - path: repo\n    type:\n      # The below are defaults. 
only set what's needed if you use a different column name.\n      system_prompt: \"\"\n      system_format: \"{system}\"\n      field_system: system\n      field_instruction: instruction\n      field_input: input\n      field_output: output\n\n      # Customizable to be single line or multi-line\n      # Use {instruction}/{input} as key to be replaced\n      # 'format' can include {input}\n      format: |-\n        User: {instruction} {input}\n        Assistant:\n      # 'no_input_format' cannot include {input}\n      no_input_format: \"{instruction} \"\n\n      # For `completion` datsets only, uses the provided field instead of `text` column\n      field:\n\n  # Using chat template\n  - path: ...\n    # Set type to `chat_template` to use this strategy\n    type: chat_template\n    # Specify the name of the chat template to use\n    # The name of the chat template to use for training, following values are supported:\n    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.\n    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n    # - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.\n    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.\n    chat_template: tokenizer_default\n\n    # Custom jinja chat template. 
Used only if `chat_template: jinja` or empty.\n    chat_template_jinja:\n\n    # Key containing the messages (default: \"messages\")\n    field_messages: messages\n\n    # Key containing the system message (default: \"system\")\n    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.\n    field_system: system\n\n    # Mapping of properties from the input dataset to the chat template.\n    # (default: message_property_mappings={'role':'role', 'content':'content'})\n    # If a property exists in the template but not in this mapping, the system will attempt\n    # to load it directly from the message using the property name as the key.\n    # Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',\n    # while 'value' is loaded and used as 'content' in the chat template.\n    message_property_mappings:\n      role: from\n      content: value\n      # ...\n\n    # Optional[Dict[str, List]]. Roles mapping in the messages.\n    # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.\n    # The default is:\n    roles:\n      user: [\"human\", \"user\"]\n      assistant: [\"gpt\", \"assistant\"]\n      system: [\"system\"]\n      tool: [\"tool\"]\n\n    # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.\n    # This does not drop the default system message from chat_template if it exists. If you wish to,\n    # we recommend using a custom jinja template with the default system message removed or\n    # adding a system turn with empty content.\n    drop_system_message:\n\n    # Optional[bool]. 
(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags\n    # See example at `docs/dataset-formats/conversation.qmd`\n    split_thinking:\n\n    # IMPORTANT: The following fields determine which parts of the conversation to train on.\n    # Priority order: message_field_training &gt; message_field_training_detail &gt; train_on_inputs or role in roles_to_train\n    # See examples at `docs/dataset-formats/conversation.qmd`\n    # Note: If the below 5 fields are empty, defaults to training only on the last message.\n\n    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.\n    roles_to_train: [\"assistant\"]  # default\n    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:\n    # - all: train on all EOS tokens\n    # - turn (default): train on the EOS token at the end of each trainable turn\n    # - last: train on the last EOS token in the conversation\n    # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.\n    train_on_eos: turn\n    # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:\n    # - all: train on all EOT tokens\n    # - turn: train on the EOT token at the end of each trainable turn\n    # - last: train on the last EOT token in the conversation\n    # If not specified, defaults to the value of train_on_eos for backward compatibility.\n    train_on_eot:\n    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.\n    message_field_training: training\n    # The key in the message turn that contains the training details. 
Useful to selectively train on certain tokens in a turn.\n    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).\n    message_field_training_detail: train_detail\n\n\n# If false, the datasets will not be shuffled and will keep their original order in `datasets`.\n# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.\nshuffle_merged_datasets: true\n\nDeduplicates datasets and test_datasets with identical entries.\ndataset_exact_deduplication: true\n\n# A list of one or more datasets to eval the model with.\n# You can use either test_datasets, or val_set_size, but not both.\ntest_datasets:\n  - path: /workspace/data/eval.jsonl\n    ds_type: json\n    # You need to specify a split. For \"json\" datasets the default split is called \"train\".\n    split: train\n    type: completion\n    data_files:\n      - /workspace/data/eval.jsonl\n\n# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'\nrl:\nrl_beta:  # Optional[float]. The beta parameter for the RL training.\n\n# dpo\ndpo_use_weighting:  # Optional[bool]. Whether to perform weighting.\nrpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.\n\n# orpo\norpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.\n\n# kto\nkto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.\nkto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.\n\n# simpo\ncpo_alpha: 1.0  # Weight of the BC regularizer\nsimpo_gamma: 0.5  # Target reward margin for the SimPO loss\n\n# grpo\ntrl:\n  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.\n  vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.\n  vllm_server_port: # Optional[int]. 
Port of the vLLM server to connect to.\n  vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.\n  vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.\n\n  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use\n  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.\n\n  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.\n  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.\n\n  num_generations: # Optional[int]. Number of generations to sample.\n  log_completions: # Optional[bool]. Whether to log completions.\n\n  sync_ref_model: # Optional[bool]. Whether to sync the reference model.\n  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.\n  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.\n\n\n# reward modelling: `True` or `False`\nreward_model:\n\n# process reward modelling: `True` or `False`\nprocess_reward_model:\n\n# The name of the chat template to use for training, following values are supported:\n# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.\n# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py\n# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.\n# - jinja: Uses a custom jinja template for the chat template. 
The custom jinja template should be provided in the chat_template_jinja field.\n# The selected chat template will be saved to the tokenizer_config.json for easier inferencing\n# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.\nchat_template: tokenizer_default\n# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.\nchat_template_jinja: null\n# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.\n# These tokens mark the boundaries between conversation turns.\n# For example: [\"/INST\", \"&lt;/s&gt;\", \"[/SYSTEM_PROMPT]\"]\n# If not specified, defaults to just the model's eos_token.\n# This is useful for templates that use multiple delimiter tokens.\neot_tokens:\n  # - \"&lt;/s&gt;\"\n  # - \"[/INST]\"\n  # - \"[/SYSTEM_PROMPT]\"\n# Changes the default system message\ndefault_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.\n# Axolotl attempts to save the dataset as an arrow after packing the data together so\n# subsequent training attempts load faster, relative path\ndataset_prepared_path: data/last_run_prepared\n# Push prepared dataset to hub\npush_dataset_to_hub: # Optional[str] repo_org/repo_name\n# The maximum number of processes to use while preprocessing your input dataset. 
This defaults to `os.cpu_count()`\n# if not set.\ndataset_processes: # defaults to os.cpu_count() if not set\n# Keep dataset in memory while preprocessing\n# Only needed if cached dataset is taking too much storage\ndataset_keep_in_memory:\n# push checkpoints to hub\nhub_model_id: # private repo path to push finetuned model\n# how to push checkpoints to hub\n# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy\nhub_strategy:\n# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets\n# Required to be true when used in combination with `push_dataset_to_hub`\nhf_use_auth_token: # boolean\n# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.\nval_set_size: 0.04\n# Num shards for whole dataset\ndataset_shard_num:\n# Index of shard to use for whole dataset\ndataset_shard_idx:\n\n# The maximum length of an input to train with, this should typically be less than 2048\n# as most models have a token/context limit of 2048\nsequence_len: 2048\n# Pad inputs so each step uses constant sized buffers\n# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently\npad_to_sequence_len:\n# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'\nsample_packing:\n# Set to 'false' if getting errors during eval with sample_packing on.\neval_sample_packing:\n# You can set these packing optimizations AFTER starting a training at least once.\n# The trainer will provide recommended values for these values.\nsample_packing_eff_est:\ntotal_num_tokens:\n# Increasing the following values helps with packing, but usually only slightly (&lt;%1.)\n# The number of samples packed at a time.\nsample_packing_group_size: 100000\n# The number of samples which can be packed into one sequence. 
Increase if using a large sequence_len with many short samples.\nsample_packing_bin_size: 200\nsample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.\n\n# whether to concatenate samples during pretraining\npretraining_sample_concatenation:\n\ncurriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning\n\n# Use batch flattening for speedups when not using sample_packing\nbatch_flattening:\n\n# Passed through to transformers when loading the model when launched without accelerate\n# Use `sequential` when training w/ model parallelism to limit memory\ndevice_map:\n# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.\nmax_memory:\n\n# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model\nadapter: lora\n# If you already have a lora model trained that you want to load, put that here.\n# This means after training, if you want to test the model, you should set this to the value of `output_dir`.\n# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.\nlora_model_dir:\n\n# LoRA hyperparameters\n# For more details about the following options, see:\n# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2\nlora_r: 8\nlora_alpha: 16\nlora_dropout: 0.05\nlora_target_modules:\n  - q_proj\n  - v_proj\n#  - k_proj\n#  - o_proj\n#  - gate_proj\n#  - down_proj\n#  - up_proj\nlora_target_linear: # If true, will target all linear modules\n\n# List[int] | int. # The layer indices to transform, otherwise, apply to all layers\n# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform\npeft_layers_to_transform:\n\n# Optional[bool]. 
Whether to use DoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora\npeft_use_dora:\n\n# Optional[bool]. Whether to use RSLoRA.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora\npeft_use_rslora:\n\n# Optional[list[tuple[int, int]]]. List of layer indices to replicate.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora\npeft_layer_replication:\n\n# bool | Literal[\"gaussian\", \"eva\", \"olora\", \"pissa\", \"pissa_niter_[number of iters]\", \"corda\", \"loftq\"]\n# How to initialize LoRA weights. Default to True which is MS original implementation.\n# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization\npeft_init_lora_weights:\n\n# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.\n# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.\n# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.\n# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994\nlora_modules_to_save:\n#  - embed_tokens\n#  - lm_head\n\nlora_fan_in_fan_out: false\n\n# Apply custom LoRA autograd functions and activation function Triton kernels for\n# speed and memory savings\n# See: https://docs.axolotl.ai/docs/lora_optims.html\nlora_mlp_kernel: true\nlora_qkv_kernel: true\nlora_o_kernel: true\n\n# LoRA+ hyperparameters\n# For more details about the following options, see:\n# https://arxiv.org/abs/2402.12354  and `src/axolotl/core/train_builder.py`\nloraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.\nloraplus_lr_embedding: #  loraplus learning rate for lora embedding layers. 
Default value is 1e-6.\n\npeft:\n  # Configuration options for loftq initialization for LoRA\n  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization\n  loftq_config:\n    loftq_bits:  # typically 4 bits\n\n# ReLoRA configuration\n# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed\nrelora_steps: # Number of steps per ReLoRA restart\nrelora_warmup_steps: # Number of per-restart warmup steps\nrelora_anneal_steps: # Number of anneal steps for each relora cycle\nrelora_prune_ratio: # threshold for optimizer magnitude when pruning\nrelora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings\n\n# wandb configuration if you're using it\n# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.\nwandb_mode: # \"offline\" to save run metadata locally and not sync to the server, \"disabled\" to turn off wandb\nwandb_project: # Your wandb project name\nwandb_entity: # A wandb Team name if using a Team\nwandb_watch:\nwandb_name: # Set the name of your wandb run\nwandb_run_id: # Set the ID of your wandb run\nwandb_log_model: # \"checkpoint\" to log model to wandb Artifacts every `save_steps` or \"end\" to log only at the end of training\n\n# mlflow configuration if you're using it\nmlflow_tracking_uri: # URI to mlflow\nmlflow_experiment_name: # Your experiment name\nmlflow_run_name: # Your run name\nhf_mlflow_log_artifacts:  # set to true to copy each saved checkpoint on each save to mlflow artifact registry\n\n# Comet configuration if you're using it\n# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.\n# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start\nuse_comet: # Enable or disable Comet integration.\ncomet_api_key: # API key for Comet. 
Recommended to set via `comet login`.\ncomet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.\ncomet_project_name: # Project name in Comet. Defaults to Uncategorized.\ncomet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.\ncomet_mode: # Create a new experiment (\"create\") or log to an existing one (\"get\"). Default (\"get_or_create\") auto-selects based on configuration.\ncomet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.\ncomet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.\n\n# Tensorboard\nuse_tensorboard: # Optional[bool]\n\n# Where to save the full-finetuned model to\noutput_dir: ./completed-model\n\n# Whether to use torch.compile and which backend to use\n# setting to `auto` will enable torch compile when torch&gt;=2.5.1\ntorch_compile:  # Optional[Union[Literal[\"auto\"], bool]]\ntorch_compile_backend:  # Optional[str]\n\n# Training hyperparameters\n\n# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.\ngradient_accumulation_steps: 1\n# The number of samples to include in each batch. This is the number of samples sent to each GPU.\n# Batch size per gpu = micro_batch_size * gradient_accumulation_steps\nmicro_batch_size: 2\neval_batch_size:\nnum_epochs: 4\nwarmup_steps: 100  # cannot use with warmup_ratio\nwarmup_ratio: 0.05  # cannot use with warmup_steps\nlearning_rate: 0.00003\nlr_quadratic_warmup:\nlogging_steps:\neval_steps: # Leave empty to eval at each epoch, integer for every N steps. 
float for fraction of total steps\nevals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps\neval_strategy: # Set to `\"no\"` to skip evaluation, `\"epoch\"` at end of each epoch, leave empty to infer from `eval_steps`.\nsave_strategy: # Set to `\"no\"` to skip checkpoint saves, `\"epoch\"` at end of each epoch, `\"best\"` when better result is achieved, leave empty to infer from `save_steps`.\nsave_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps\nsaves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps\nsave_total_limit: # Checkpoints saved at a time\nsave_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.\n# Maximum number of iterations to train for. It precedes num_epochs which means that\n# if both are set, num_epochs will not be guaranteed.\n# e.g., when 1 epoch is 1000 steps =&gt; `num_epochs: 2` and `max_steps: 100` will train for 100 steps\nmax_steps:\n\n# bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time.\ninclude_tokens_per_second: # Optional[bool]\n\n# whether to find batch size that fits in memory. Passed to underlying transformers Trainer\nauto_find_batch_size: # Optional[bool]\n\neval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0\neval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128\ndo_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.\neval_causal_lm_metrics: # HF evaluate metrics used during evaluation. 
Default is [\"sacrebleu\", \"comet\", \"ter\", \"chrf\", \"perplexity\"]\n\nprofiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.\n                # see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information\n                # snapshots can be visualized @ https://pytorch.org/memory_viz\n\nloss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)\nloss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)\n\n# Save model as safetensors (require safetensors package)\nsave_safetensors:\n\n# Whether to mask out or include the human's prompt from the training labels\ntrain_on_inputs: false\n# Group similarly sized data to minimize padding.\n# May be slower to start, as it must download and sort the entire dataset.\n# Note that training loss may have an oscillating pattern with this enabled.\ngroup_by_length: false\n\n# Whether to use gradient checkpointing. Available options are: true, false, \"offload\".\n# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing\ngradient_checkpointing: false\n# additional kwargs to pass to the trainer for gradient checkpointing\n# gradient_checkpointing_kwargs:\n#   use_reentrant: true\n\n# Stop training after this many evaluation losses have increased in a row\n# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback\nearly_stopping_patience: 3\n\n# Specify a scheduler and kwargs to use with the optimizer\nlr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | 'linear' | 'cosine_with_restarts' | 'polynomial' | 'constant' | 'constant_with_warmup' | 'inverse_sqrt' | 'reduce_lr_on_plateau' | 'cosine_with_min_lr' | 'warmup_stable_decay' | empty for cosine\nlr_scheduler_kwargs:\ncosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. 
cosine_min_lr_ratio=0.1 for 10% of peak lr\ncosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)\n\n# For one_cycle optim\nlr_div_factor: # Learning rate div factor\n\n# Specify optimizer\n# Valid values are driven by the Transformers OptimizerNames class, see:\n# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189\n#\n# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of\n# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used\n# in the examples/ for your model and fine-tuning use case.\n#\n# Valid values for 'optimizer' include:\n# - adamw_torch\n# - adamw_torch_fused\n# - adamw_torch_xla\n# - adamw_torch_npu_fused\n# - adamw_apex_fused\n# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version &gt;= 2.5.1)\n# - adafactor\n# - adamw_anyprecision\n# - adamw_torch_4bit\n# - ademamix\n# - sgd\n# - adagrad\n# - adamw_bnb_8bit\n# - adamw_8bit   # alias for adamw_bnb_8bit\n# - ademamix_8bit\n# - lion_8bit\n# - lion_32bit\n# - paged_adamw_32bit\n# - paged_adamw_8bit\n# - paged_ademamix_32bit\n# - paged_ademamix_8bit\n# - paged_lion_32bit\n# - paged_lion_8bit\n# - rmsprop\n# - rmsprop_bnb\n# - rmsprop_bnb_8bit\n# - rmsprop_bnb_32bit\n# - galore_adamw\n# - galore_adamw_8bit\n# - galore_adafactor\n# - galore_adamw_layerwise\n# - galore_adamw_8bit_layerwise\n# - galore_adafactor_layerwise\n# - lomo\n# - adalomo\n# - grokadamw\n# - schedule_free_adamw\n# - schedule_free_sgd\n# - apollo_adamw\n# - apollo_adamw_layerwise\n#\n# Additional custom optimizers include:\n# - optimi_adamw\n# - ao_adamw_8bit\n# - ao_adamw_fp8\n# - came_pytorch\noptimizer:\n# Dictionary of arguments to pass to the optimizer\noptim_args:\n# For 
Galore Optimizers the following optim_args are available\n# rank:  # type: int\n# update_proj_gap  # type: int\n# scale  # type: float\n# proj_type:  # type: str, default = std\n\n# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm\noptim_target_modules:\n# - self_attn  # for llama\n# - mlp\n\n# Specify weight decay\nweight_decay:\n# adamw hyperparams\nadam_beta1:\nadam_beta2:\nadam_epsilon:\n# Gradient clipping max norm\nmax_grad_norm:\n\n# Augmentation techniques\n# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings\n# currently only supported on Llama and Mistral\nneftune_noise_alpha:\n\n# Optional[bool]. Whether to bettertransformers\nflash_optimum:\n\n# Note: Only one of the following attention patches can be used at a time.\n# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.\n\n# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:\nxformers_attention:\n# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:\nflash_attention:\nflash_attn_cross_entropy:  # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only\nflash_attn_rms_norm:  # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only\nflash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation\nflash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation\n# Optional[bool]. Whether to use scaled-dot-product attention\n# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html\nsdp_attention:\n# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf\ns2_attention:\n\n# Optional[bool]. 
Whether to use low_cpu_mem_usage\nlow_cpu_mem_usage:\n# Optional[str]. Resume from a specific checkpoint dir\nresume_from_checkpoint:\n# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.\n# Be careful with this being turned on between different models.\nauto_resume_from_checkpoints: false\n\n## Multimodal section\n# int | tuple[int, int] | None . Size to resize images to, width x height.\n# Will read from model/processor config if not set.\nimage_size:\n# str. Algorithm to use for image resizing. \"bilinear\", \"bicubic\", \"lanczos\". Default is \"bilinear\".\nimage_resize_algorithm: 'bilinear'\n## End of multimodal section\n\n# Don't mess with this, it's here for accelerate and torchrun\nlocal_rank:\n\n# Add or change special tokens.\n# If you add tokens here, you don't need to add them to the `tokens` list.\nspecial_tokens:\n  # bos_token: \"&lt;s&gt;\"\n  # eos_token: \"&lt;/s&gt;\"\n  # unk_token: \"&lt;unk&gt;\"\n  # pad_token: \"[PAD]\"\n\n# Optional[list[str]]. Add extra tokens to the tokenizer.\ntokens:\n  # - \"&lt;|startoftext|&gt;\"\n  # - \"&lt;|endoftext|&gt;\"\n\n# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.\n# Only works for tokens that are not part of the base vocab (aka are added_tokens).\n# Can be checked if they exist in tokenizer.json added_tokens.\nadded_tokens_overrides:  # Dict[int, str]\n#  128041: \"&lt;|im_start|&gt;\"\n#  128042: \"&lt;|im_end|&gt;\"\n\n# FSDP\nfsdp:\nfsdp_config:\n\n# Deepspeed config path. 
e.g., deepspeed_configs/zero3.json\ndeepspeed:\n\n# Advanced DDP Arguments\nddp_timeout:\nddp_bucket_cap_mb:\nddp_broadcast_buffers:\n\n# Sequence parallelism\n# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.\n# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.\n# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized\n# subsequences, or set to 4 to split into four equal-sized subsequences.\n# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.\nsequence_parallel_degree:\n# Optional; strides across the key dimension. Larger values use more memory but should make training faster.\n# Must evenly divide the number of KV heads in your model.\nheads_k_stride: 1\n# One of \"varlen_llama3\", \"batch_ring\", \"batch_zigzag\", \"batch_stripe\". Defaults to \"varlen_llama3\"\n# in the sample packing case, and \"batch_ring\" in the non-sample packing case.\nring_attn_func:\n\n# Path to torch distx for optim 'adamw_anyprecision'\ntorchdistx_path:\n\n# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize\npretraining_dataset:\n\n# Debug mode\ndebug:\n\n# Seed\nseed:\n\n# Allow overwrite yml config using from cli\nstrict:",
-    "crumbs": [
-      "Getting Started",
-      "Config Reference"
-    ]
-  },
-  {
-    "objectID": "docs/reward_modelling.html",
-    "href": "docs/reward_modelling.html",
-    "title": "Reward Modelling",
-    "section": "",
-    "text": "Overview\nReward modelling is a technique used to train models to predict the reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions.\nWe support the reward modelling techniques supported by trl.\n\n\n(Outcome) Reward Models\nOutcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).\nbase_model: google/gemma-2-2b\nmodel_type: AutoModelForSequenceClassification\nnum_labels: 1\ntokenizer_type: AutoTokenizer\n\nreward_model: true\nchat_template: gemma\ndatasets:\n  - path: argilla/distilabel-intel-orca-dpo-pairs\n    type: bradley_terry.chat_template\n\nval_set_size: 0.1\neval_steps: 100\nBradley-Terry chat templates expect single-turn conversations in the following format:\n{\n    \"system\": \"...\", // optional\n    \"input\": \"...\",\n    \"chosen\": \"...\",\n    \"rejected\": \"...\"\n}\n\n\nProcess Reward Models (PRM)\n\n\n\n\n\n\nTip\n\n\n\nCheck out our PRM blog.\n\n\nProcess reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.\nbase_model: Qwen/Qwen2.5-3B\nmodel_type: AutoModelForTokenClassification\nnum_labels: 2\n\nprocess_reward_model: true\ndatasets:\n  - path: trl-lib/math_shepherd\n    type: stepwise_supervised\n    split: train\n\nval_set_size: 0.1\neval_steps: 100\nPlease see stepwise_supervised for more details on the dataset format.",
-    "crumbs": [
-      "How To Guides",
-      "Reward Modelling"
-    ]
-  },
-  {
-    "objectID": "docs/api/prompt_strategies.dpo.chatml.html",
-    "href": "docs/api/prompt_strategies.dpo.chatml.html",
-    "title": "prompt_strategies.dpo.chatml",
-    "section": "",
-    "text": "prompt_strategies.dpo.chatml\nDPO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
-  },
-  {
-    "objectID": "docs/api/prompt_strategies.dpo.chatml.html#functions",
-    "href": "docs/api/prompt_strategies.dpo.chatml.html#functions",
-    "title": "prompt_strategies.dpo.chatml",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.chatml.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.chatml.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
-  },
-  {
-    "objectID": "docs/api/kernels.geglu.html",
-    "href": "docs/api/kernels.geglu.html",
-    "title": "kernels.geglu",
-    "section": "",
-    "text": "kernels.geglu\nModule for definition of GEGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
-  },
-  {
-    "objectID": "docs/api/kernels.geglu.html#functions",
-    "href": "docs/api/kernels.geglu.html#functions",
-    "title": "kernels.geglu",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\ngeglu_backward\nGEGLU backward pass using in-place operations.\n\n\ngeglu_forward\nGEGLU forward pass.\n\n\n\n\n\nkernels.geglu.geglu_backward(grad_output, gate, up)\nGEGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - GEGLU activation output (h) - Gradient with respect to gate (grad_gate) - Gradient with respect to up (grad_up)\n\n\n\n\n\n\nThis function modifies its input tensors in-place to store results.\n\n\n\n\nkernels.geglu.geglu_forward(gate, up)\nGEGLU forward pass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\ntorch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim]."
-  },
-  {
-    "objectID": "docs/api/core.chat.format.shared.html",
-    "href": "docs/api/core.chat.format.shared.html",
-    "title": "core.chat.format.shared",
-    "section": "",
-    "text": "core.chat.format.shared\ncore.chat.format.shared\nshared functions for format transforms"
-  },
-  {
-    "objectID": "docs/api/cli.vllm_serve.html",
-    "href": "docs/api/cli.vllm_serve.html",
-    "title": "cli.vllm_serve",
-    "section": "",
-    "text": "cli.vllm_serve\nCLI to start the vllm server for online RL\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
-  },
-  {
-    "objectID": "docs/api/cli.vllm_serve.html#functions",
-    "href": "docs/api/cli.vllm_serve.html#functions",
-    "title": "cli.vllm_serve",
-    "section": "",
-    "text": "Name\nDescription\n\n\n\n\ndo_vllm_serve\nStarts the VLLM server for serving LLM models used for online RL\n\n\n\n\n\ncli.vllm_serve.do_vllm_serve(config, cli_args)\nStarts the VLLM server for serving LLM models used for online RL\nArgs\n:param cfg: Parsed doct of the YAML config\n:param cli_args: dict of additional command-line arguments of type VllmServeCliArgs\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nprocess_id\n\nthe process id of the started VLLM server"
-  },
-  {
-    "objectID": "docs/api/core.training_args.html",
-    "href": "docs/api/core.training_args.html",
-    "title": "core.training_args",
-    "section": "",
-    "text": "core.training_args\nextra axolotl specific training args\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n    
simpo_gamma=None,\n)\nCPO config for CPO training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    
relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    
lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    
image_size=None,\n    image_resize_algorithm=None,\n)\nReward config for Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    
eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args."
-  },
-  {
-    "objectID": "docs/api/core.training_args.html#classes",
-    "href": "docs/api/core.training_args.html#classes",
-    "title": "core.training_args",
+    "objectID": "docs/api/core.trainers.utils.html",
+    "href": "docs/api/core.trainers.utils.html",
+    "title": "core.trainers.utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOConfig\nCPO config for CPO training\n\n\nAxolotlKTOConfig\nKTO config for KTO training\n\n\nAxolotlORPOConfig\nORPO config for ORPO training\n\n\nAxolotlPRMConfig\nPRM config for PRM training\n\n\nAxolotlRewardConfig\nReward config for Reward training\n\n\nAxolotlTrainingArguments\nTraining arguments for Causal trainer\n\n\nAxolotlTrainingMixins\nMixin class for the Axolotl training args.\n\n\n\n\n\ncore.training_args.AxolotlCPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n    simpo_gamma=None,\n)\nCPO config for CPO 
training\n\n\n\ncore.training_args.AxolotlKTOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nKTO config for KTO training\n\n\n\ncore.training_args.AxolotlORPOConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    
relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nORPO config for ORPO training\n\n\n\ncore.training_args.AxolotlPRMConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    
orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nPRM config for PRM training\n\n\n\ncore.training_args.AxolotlRewardConfig(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nReward config for 
Reward training\n\n\n\ncore.training_args.AxolotlTrainingArguments(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nTraining arguments for Causal trainer\nThis code is duplicated due to HF TrainingArguments not setting output_dir with a\ndefault value so it can’t be used as a mixin.\n\n\n\ncore.training_args.AxolotlTrainingMixins(\n    self,\n    model_type=None,\n    lr_quadratic_warmup=False,\n    pretraining=False,\n    sample_packing=False,\n    sample_packing_sequentially=False,\n    multipack_real_batches=False,\n    eval_sample_packing=None,\n    sample_packing_efficiency=1.0,\n    
sample_packing_bin_size=200,\n    sample_packing_group_size=100000,\n    max_seq_length=2048,\n    relora_steps=None,\n    relora_warmup_steps=None,\n    relora_anneal_steps=None,\n    relora_prune_ratio=0.9,\n    bench_split='eval',\n    bench_dataset='pharaouk/dharma-1/dharma_1_mini.json',\n    do_bench_eval=False,\n    do_causal_lm_eval=False,\n    max_bench_samples=None,\n    bench_source_max_len=2048,\n    dataloader_prefetch_factor=None,\n    cosine_min_lr_ratio=None,\n    cosine_constant_lr_ratio=None,\n    loraplus_lr_ratio=None,\n    loraplus_lr_embedding=1e-06,\n    embedding_lr_scale=None,\n    lr_groups=None,\n    embedding_lr=None,\n    qlora=False,\n    orpo_alpha=None,\n    lisa_n_layers=None,\n    lisa_step_interval=None,\n    lisa_layers_attribute=None,\n    curriculum_sampling=None,\n    alternate_optimizer=None,\n    alternate_lr_scheduler_type=None,\n    chat_template=None,\n    kd_ce_alpha=None,\n    kd_alpha=1.0,\n    kd_temperature=1.0,\n    kd_zscore_base_temp=None,\n    kd_top_k_before_softmax=None,\n    sequence_parallel_degree=1,\n    ring_attn_func=None,\n    image_size=None,\n    image_resize_algorithm=None,\n)\nMixin class for the Axolotl training args."
+    "text": "core.trainers.utils\ncore.trainers.utils\nUtils for Axolotl trainers"
   },
   {
-    "objectID": "docs/api/prompt_strategies.dpo.passthrough.html",
-    "href": "docs/api/prompt_strategies.dpo.passthrough.html",
-    "title": "prompt_strategies.dpo.passthrough",
+    "objectID": "docs/api/core.chat.format.llama3x.html",
+    "href": "docs/api/core.chat.format.llama3x.html",
+    "title": "core.chat.format.llama3x",
     "section": "",
-    "text": "prompt_strategies.dpo.passthrough\nprompt_strategies.dpo.passthrough\nDPO prompt strategies passthrough/zero-processing strategy"
+    "text": "core.chat.format.llama3x\ncore.chat.format.llama3x\nLlama 3.x chat formatting functions for MessageContents"
   },
   {
-    "objectID": "docs/api/monkeypatch.unsloth_.html",
-    "href": "docs/api/monkeypatch.unsloth_.html",
-    "title": "monkeypatch.unsloth_",
+    "objectID": "docs/api/prompt_strategies.alpaca_instruct.html",
+    "href": "docs/api/prompt_strategies.alpaca_instruct.html",
+    "title": "prompt_strategies.alpaca_instruct",
     "section": "",
-    "text": "monkeypatch.unsloth_\nmonkeypatch.unsloth_\nmodule for patching with unsloth optimizations"
+    "text": "prompt_strategies.alpaca_instruct\nprompt_strategies.alpaca_instruct\nModule loading the AlpacaInstructPromptTokenizingStrategy class"
   },
   {
-    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html",
-    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html",
-    "title": "monkeypatch.trainer_fsdp_optim",
+    "objectID": "docs/api/cli.cloud.base.html",
+    "href": "docs/api/cli.cloud.base.html",
+    "title": "cli.cloud.base",
     "section": "",
-    "text": "monkeypatch.trainer_fsdp_optim\nfix for FSDP optimizer save in trainer w 4.47.0\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
+    "text": "cli.cloud.base\nbase class for cloud platforms from cli\n\n\n\n\n\nName\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
   },
   {
-    "objectID": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
-    "href": "docs/api/monkeypatch.trainer_fsdp_optim.html#functions",
-    "title": "monkeypatch.trainer_fsdp_optim",
+    "objectID": "docs/api/cli.cloud.base.html#classes",
+    "href": "docs/api/cli.cloud.base.html#classes",
+    "title": "cli.cloud.base",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\npatch_training_loop_for_fsdp\nmonkeypatch for fixing the training loop for fsdp with optimizer save\n\n\n\n\n\nmonkeypatch.trainer_fsdp_optim.patch_training_loop_for_fsdp()\nmonkeypatch for fixing the training loop for fsdp with optimizer save"
+    "text": "Name\nDescription\n\n\n\n\nCloud\nAbstract base class for cloud platforms.\n\n\n\n\n\ncli.cloud.base.Cloud()\nAbstract base class for cloud platforms."
   },
   {
-    "objectID": "docs/api/utils.model_shard_quant.html",
-    "href": "docs/api/utils.model_shard_quant.html",
-    "title": "utils.model_shard_quant",
+    "objectID": "docs/api/kernels.quantize.html",
+    "href": "docs/api/kernels.quantize.html",
+    "title": "kernels.quantize",
     "section": "",
-    "text": "utils.model_shard_quant\nmodule to handle loading model on cpu/meta device for FSDP\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
+    "text": "kernels.quantize\nDequantization utilities for bitsandbytes integration.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (&gt;0.43.3)."
   },
   {
-    "objectID": "docs/api/utils.model_shard_quant.html#functions",
-    "href": "docs/api/utils.model_shard_quant.html#functions",
-    "title": "utils.model_shard_quant",
+    "objectID": "docs/api/kernels.quantize.html#functions",
+    "href": "docs/api/kernels.quantize.html#functions",
+    "title": "kernels.quantize",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_and_quantize\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\n\n\n\n\n\nutils.model_shard_quant.load_and_quantize(\n    module,\n    name,\n    value,\n    device=None,\n    dtype=None,\n    skip_names=None,\n    to_cpu=False,\n    to_meta=False,\n    verbose=False,\n    quant_method='bnb',\n)\nLoads value tensor into submodule of module, optionally skipping skip_names and converting to dtype.\nQuantizes Params4bit on device then places on “cpu” if to_cpu=True or “meta” if to_meta=True."
+    "text": "Name\nDescription\n\n\n\n\ndequantize\nFast NF4 dequantization using bitsandbytes CUDA kernels.\n\n\n\n\n\nkernels.quantize.dequantize(W, quant_state=None, out=None)\nFast NF4 dequantization using bitsandbytes CUDA kernels.\nPerforms efficient dequantization of weights from NF4 format using bitsandbytes’\noptimized CUDA implementations. Supports both legacy list and new QuantState\nformats.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nW\ntorch.Tensor\nQuantized weight tensor to dequantize\nrequired\n\n\nquant_state\nQuantState | list | None\nQuantization state containing metadata needed for dequantization. Can be either a QuantState object or legacy list format. If None, returns W unchanged.\nNone\n\n\nout\ntorch.Tensor | None\nOptional output tensor for storing dequantized results. Must match expected shape and dtype if provided.\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nDequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if\n\n\n\ntorch.Tensor\ninput W was transposed.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nAssertionError\nIf provided output tensor doesn’t match expected shape / dtype.\n\n\n\n\n\n\nUses CUDA streams for better performance when available in newer bitsandbytes\nversions (&gt;0.43.3)."
   },
   {
-    "objectID": "docs/api/utils.chat_templates.html",
-    "href": "docs/api/utils.chat_templates.html",
-    "title": "utils.chat_templates",
+    "objectID": "docs/api/prompt_strategies.dpo.zephyr.html",
+    "href": "docs/api/prompt_strategies.dpo.zephyr.html",
+    "title": "prompt_strategies.dpo.zephyr",
     "section": "",
-    "text": "utils.chat_templates\nThis module provides functionality for selecting chat templates based on user choices.\nThese templates are used for formatting messages in a conversation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n    user_choice,\n    jinja_template=None,\n    tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired"
+    "text": "prompt_strategies.dpo.zephyr\nprompt_strategies.dpo.zephyr\nDPO strategies for zephyr"
   },
   {
-    "objectID": "docs/api/utils.chat_templates.html#functions",
-    "href": "docs/api/utils.chat_templates.html#functions",
-    "title": "utils.chat_templates",
+    "objectID": "docs/api/core.trainers.mixins.sequence_parallel.html",
+    "href": "docs/api/core.trainers.mixins.sequence_parallel.html",
+    "title": "core.trainers.mixins.sequence_parallel",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_chat_template\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\nregister_chat_template\nRegisters chat templates.\n\n\n\n\n\nutils.chat_templates.get_chat_template(\n    user_choice,\n    jinja_template=None,\n    tokenizer=None,\n)\nFinds the correct chat_template based on the user’s choice, jinja_template, and tokenizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nuser_choice\nstr\nThe user’s choice of template.\nrequired\n\n\njinja_template\nOptional[str]\nThe jinja template string. Defaults to None.\nNone\n\n\ntokenizer\nOptional[PreTrainedTokenizerBase]\nThe tokenizer. Defaults to None.\nNone\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nstr\nstr\nThe chosen template string.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the user_choice is not found in the templates.\n\n\n\n\n\n\n\nutils.chat_templates.register_chat_template(template_name, chat_template)\nRegisters chat templates.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntemplate_name\nstr\nThe name of the template.\nrequired\n\n\nchat_template\nstr\nThe template string.\nrequired"
+    "text": "core.trainers.mixins.sequence_parallel\nModule for Axolotl trainer sequence parallelism mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nSequenceParallelMixin\nMixin class for sequence parallelism support in trainers.\n\n\n\n\n\ncore.trainers.mixins.sequence_parallel.SequenceParallelMixin()\nMixin class for sequence parallelism support in trainers.\nThis mixin provides functionality for handling sequence parallelism,\nspecifically for creating appropriate data samplers."
   },
   {
-    "objectID": "docs/api/integrations.grokfast.optimizer.html",
-    "href": "docs/api/integrations.grokfast.optimizer.html",
-    "title": "integrations.grokfast.optimizer",
+    "objectID": "docs/api/core.trainers.mixins.sequence_parallel.html#classes",
+    "href": "docs/api/core.trainers.mixins.sequence_parallel.html#classes",
+    "title": "core.trainers.mixins.sequence_parallel",
     "section": "",
-    "text": "integrations.grokfast.optimizer\nintegrations.grokfast.optimizer"
+    "text": "Name\nDescription\n\n\n\n\nSequenceParallelMixin\nMixin class for sequence parallelism support in trainers.\n\n\n\n\n\ncore.trainers.mixins.sequence_parallel.SequenceParallelMixin()\nMixin class for sequence parallelism support in trainers.\nThis mixin provides functionality for handling sequence parallelism,\nspecifically for creating appropriate data samplers."
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html",
-    "title": "monkeypatch.llama_attn_hijack_flash",
+    "objectID": "docs/api/prompt_strategies.pygmalion.html",
+    "href": "docs/api/prompt_strategies.pygmalion.html",
+    "title": "prompt_strategies.pygmalion",
     "section": "",
-    "text": "monkeypatch.llama_attn_hijack_flash\nFlash attention monkey patch for llama model\n\n\n\n\n\nName\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
+    "text": "prompt_strategies.pygmalion\nModule containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class\n\n\n\n\n\nName\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion."
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#classes",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#classes",
-    "title": "monkeypatch.llama_attn_hijack_flash",
+    "objectID": "docs/api/prompt_strategies.pygmalion.html#classes",
+    "href": "docs/api/prompt_strategies.pygmalion.html#classes",
+    "title": "prompt_strategies.pygmalion",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nFusedAttention\nFused QKV Attention layer for incrementally improved training efficiency\n\n\nLlamaDecoderLayer\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.FusedAttention(self, config, q, k, v, o)\nFused QKV Attention layer for incrementally improved training efficiency\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer()\npatched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.LlamaDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone"
+    "text": "Name\nDescription\n\n\n\n\nPygmalionPromptTokenizingStrategy\nTokenizing strategy for Pygmalion.\n\n\nPygmalionPrompter\nPrompter for Pygmalion.\n\n\n\n\n\nprompt_strategies.pygmalion.PygmalionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Pygmalion.\n\n\n\nprompt_strategies.pygmalion.PygmalionPrompter(self, *args, **kwargs)\nPrompter for Pygmalion."
   },
   {
-    "objectID": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
-    "href": "docs/api/monkeypatch.llama_attn_hijack_flash.html#functions",
-    "title": "monkeypatch.llama_attn_hijack_flash",
+    "objectID": "docs/api/cli.checks.html",
+    "href": "docs/api/cli.checks.html",
+    "title": "cli.checks",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nflashattn_forward\nInput shape: Batch x Time x Channel\n\n\nflashattn_forward_with_s2attn\nInput shape: Batch x Time x Channel\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nattention_mask: [bsz, q_len]\n\n\n\nmonkeypatch.llama_attn_hijack_flash.flashattn_forward_with_s2attn(\n    self,\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    padding_mask=None,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\nInput shape: Batch x Time x Channel\nFrom: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py\nattention_mask: [bsz, q_len]\ncu_seqlens will be ignored if provided\nmax_seqlen will be ignored if provided\n\n\n\nmonkeypatch.llama_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
+    "text": "cli.checks\nVarious checks for Axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
   },
   {
-    "objectID": "docs/api/integrations.liger.args.html",
-    "href": "docs/api/integrations.liger.args.html",
-    "title": "integrations.liger.args",
+    "objectID": "docs/api/cli.checks.html#functions",
+    "href": "docs/api/cli.checks.html#functions",
+    "title": "cli.checks",
     "section": "",
-    "text": "integrations.liger.args\nModule for handling LIGER input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLigerArgs\nInput args for LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER."
+    "text": "Name\nDescription\n\n\n\n\ncheck_accelerate_default_config\nLogs at warning level if no accelerate config file is found.\n\n\ncheck_user_token\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\ncli.checks.check_accelerate_default_config()\nLogs at warning level if no accelerate config file is found.\n\n\n\ncli.checks.check_user_token()\nChecks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nbool\nBoolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nLocalTokenNotFoundError\nIf HF user info can’t be retrieved."
   },
   {
-    "objectID": "docs/api/integrations.liger.args.html#classes",
-    "href": "docs/api/integrations.liger.args.html#classes",
-    "title": "integrations.liger.args",
+    "objectID": "docs/api/monkeypatch.attention.mllama.html",
+    "href": "docs/api/monkeypatch.attention.mllama.html",
+    "title": "monkeypatch.attention.mllama",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLigerArgs\nInput args for LIGER.\n\n\n\n\n\nintegrations.liger.args.LigerArgs()\nInput args for LIGER."
+    "text": "monkeypatch.attention.mllama\nMonkeypatch for Vision Llama for FA2 support\n\n\n\n\n\nName\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n    self,\n    *args,\n    **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n    self,\n    config,\n    layer_idx,\n    *args,\n    **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance."
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.optimizer.html",
-    "href": "docs/api/core.trainers.mixins.optimizer.html",
-    "title": "core.trainers.mixins.optimizer",
+    "objectID": "docs/api/monkeypatch.attention.mllama.html#classes",
+    "href": "docs/api/monkeypatch.attention.mllama.html#classes",
+    "title": "monkeypatch.attention.mllama",
     "section": "",
-    "text": "core.trainers.mixins.optimizer\nModule for Axolotl trainer optimizer mixin\n\n\n\n\n\nName\nDescription\n\n\n\n\nOptimizerMixin\nMixin class for shared handling of building custom optimizers\n\n\n\n\n\ncore.trainers.mixins.optimizer.OptimizerMixin()\nMixin class for shared handling of building custom optimizers"
+    "text": "Name\nDescription\n\n\n\n\nMllamaTextCrossFlashAttention2\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\n\n\nMllamaTextSelfFlashAttention2\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\n\n\n\n\n\nmonkeypatch.attention.mllama.MllamaTextCrossFlashAttention2(\n    self,\n    *args,\n    **kwargs,\n)\nMllama flash cross-attention module. This module inherits from MllamaTextCrossAttention and\nimplements the forward pass using Flash Attention for improved performance.\n\n\n\nmonkeypatch.attention.mllama.MllamaTextSelfFlashAttention2(\n    self,\n    config,\n    layer_idx,\n    *args,\n    **kwargs,\n)\nMllama flash self-attention module. This module inherits from MllamaTextSelfAttention and\nimplements the forward pass using Flash Attention for improved performance."
   },
   {
-    "objectID": "docs/api/core.trainers.mixins.optimizer.html#classes",
-    "href": "docs/api/core.trainers.mixins.optimizer.html#classes",
-    "title": "core.trainers.mixins.optimizer",
+    "objectID": "docs/api/cli.main.html",
+    "href": "docs/api/cli.main.html",
+    "title": "cli.main",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nOptimizerMixin\nMixin class for shared handling of building custom optimizers\n\n\n\n\n\ncore.trainers.mixins.optimizer.OptimizerMixin()\nMixin class for shared handling of building custom optimizers"
+    "text": "cli.main\nClick CLI definitions for various axolotl commands.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base 
model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
   },
   {
-    "objectID": "docs/api/utils.models.html",
-    "href": "docs/api/utils.models.html",
-    "title": "utils.models",
+    "objectID": "docs/api/cli.main.html#functions",
+    "href": "docs/api/cli.main.html#functions",
+    "title": "cli.main",
     "section": "",
-    "text": "utils.models\nModule for models and model loading\n\n\n\n\n\nName\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n    self,\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). 
When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): 
new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
+    "text": "Name\nDescription\n\n\n\n\ncli\nAxolotl CLI - Train and fine-tune large language models\n\n\nevaluate\nEvaluate a model.\n\n\nfetch\nFetch example configs or other resources.\n\n\ninference\nRun inference with a trained model.\n\n\nmerge_lora\nMerge trained LoRA adapters into a base model.\n\n\nmerge_sharded_fsdp_weights\nMerge sharded FSDP model weights.\n\n\npreprocess\nPreprocess datasets before training.\n\n\ntrain\nTrain or fine-tune a model.\n\n\n\n\n\ncli.main.cli()\nAxolotl CLI - Train and fine-tune large language models\n\n\n\ncli.main.evaluate(config, accelerate, **kwargs)\nEvaluate a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.fetch(directory, dest)\nFetch example configs or other resources.\nAvailable directories:\n- examples: Example configuration files\n- deepspeed_configs: DeepSpeed configuration files\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndirectory\nstr\nOne of examples, deepspeed_configs.\nrequired\n\n\ndest\nOptional[str]\nOptional destination directory.\nrequired\n\n\n\n\n\n\n\ncli.main.inference(config, accelerate, gradio, **kwargs)\nRun inference with a trained model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ngradio\nbool\nWhether to use Gradio browser interface or command line for inference.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_lora(config, **kwargs)\nMerge trained LoRA adapters into a base 
model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.merge_sharded_fsdp_weights(config, accelerate, **kwargs)\nMerge sharded FSDP model weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.preprocess(config, cloud=None, **kwargs)\nPreprocess datasets before training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}\n\n\n\n\n\n\n\ncli.main.train(config, accelerate, cloud=None, sweep=None, **kwargs)\nTrain or fine-tune a model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr\nPath to axolotl config YAML file.\nrequired\n\n\naccelerate\nbool\nWhether to use accelerate launcher.\nrequired\n\n\ncloud\nOptional[str]\nPath to a cloud accelerator configuration file\nNone\n\n\nsweep\nOptional[str]\nPath to YAML config for sweeping hyperparameters.\nNone\n\n\nkwargs\n\nAdditional keyword arguments which correspond to CLI args or axolotl config options.\n{}"
   },
   {
-    "objectID": "docs/api/utils.models.html#classes",
-    "href": "docs/api/utils.models.html#classes",
-    "title": "utils.models",
+    "objectID": "docs/api/cli.utils.html",
+    "href": "docs/api/cli.utils.html",
+    "title": "cli.utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nModelLoader\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nutils.models.ModelLoader(\n    self,\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nModelLoader: managing all the config and monkey patches while loading model\n\n\n\n\n\nName\nDescription\n\n\n\n\nhas_flash_attn\nCheck if flash attention is installed\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\npatch_llama_derived_model\nModify all llama derived models in one block\n\n\npatch_loss_llama\nPatch loss functions and other optimizations\n\n\nset_attention_config\nsample packing uses custom FA2 patch\n\n\nset_auto_model_loader\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n\n\n\n\n\nutils.models.ModelLoader.patch_llama_derived_model()\nModify all llama derived models in one block\n\n\n\nutils.models.ModelLoader.patch_loss_llama()\nPatch loss functions and other optimizations\n\n\n\nutils.models.ModelLoader.set_attention_config()\nsample packing uses custom FA2 patch\n\n\n\nutils.models.ModelLoader.set_auto_model_loader()\nSet self.auto_model_loader. Defaults to transformers.AutoModelForCausalLM\n(set at __init__). When using a multimodal model, self.auto_model_loader\nshould be set according to the type of the model."
+    "text": "cli.utils\nUtility methods for axolotl CLI.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, 
raw_base_url, dest_path, dir_prefix)\nDownload a single file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, 
ProcessorMixin).\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged."
   },
   {
-    "objectID": "docs/api/utils.models.html#functions",
-    "href": "docs/api/utils.models.html#functions",
-    "title": "utils.models",
+    "objectID": "docs/api/cli.utils.html#functions",
+    "href": "docs/api/cli.utils.html#functions",
+    "title": "cli.utils",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_module_class_from_name\nGets a class from a module by its name.\n\n\nload_model\nLoad a model for a given configuration and tokenizer.\n\n\nload_tokenizer\nLoad and configure the tokenizer based on the provided config.\n\n\nmodify_tokenizer_files\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\n\n\nsetup_quantized_meta_for_peft\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\nsetup_quantized_peft_meta_for_training\nReplaces dummy quant_state.to method with the original function to allow training to continue\n\n\n\n\n\nutils.models.get_module_class_from_name(module, name)\nGets a class from a module by its name.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nmodule\ntorch.nn.Module\nThe module to get the class from.\nrequired\n\n\nname\nstr\nThe name of the class.\nrequired\n\n\n\n\n\n\n\nutils.models.load_model(\n    cfg,\n    tokenizer,\n    *,\n    processor=None,\n    inference=False,\n    reference_model=False,\n    **kwargs,\n)\nLoad a model for a given configuration and tokenizer.\n\n\n\nutils.models.load_tokenizer(cfg)\nLoad and configure the tokenizer based on the provided config.\n\n\n\nutils.models.modify_tokenizer_files(tokenizer_path, token_mappings, output_dir)\nModify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.\nThis only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ntokenizer_path\nstr\nPath or name of the original tokenizer\nrequired\n\n\ntoken_mappings\nDict[int, str]\nDict mapping {token_id (int): new_token_string}\nrequired\n\n\noutput_dir\nstr\nDirectory to save the modified 
tokenizer\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to the modified tokenizer directory\n\n\n\nRef: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941\n\n\n\n\nutils.models.setup_quantized_meta_for_peft(model)\nReplaces quant_state.to with a dummy function to prevent PEFT from moving quant_state to meta device\n\n\n\nutils.models.setup_quantized_peft_meta_for_training(model)\nReplaces dummy quant_state.to method with the original function to allow training to continue"
+    "text": "Name\nDescription\n\n\n\n\nadd_options_from_config\nCreate Click options from the fields of a Pydantic model.\n\n\nadd_options_from_dataclass\nCreate Click options from the fields of a dataclass.\n\n\nbuild_command\nBuild command list from base command and options.\n\n\ndownload_file\nDownload a single file and return its processing status.\n\n\nfetch_from_github\nSync files from a specific directory in the GitHub repository.\n\n\nfilter_none_kwargs\nWraps function to remove None-valued kwargs.\n\n\nload_model_and_tokenizer\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\n\n\nstrip_optional_type\nExtracts the non-None type from an Optional / Union type.\n\n\n\n\n\ncli.utils.add_options_from_config(config_class)\nCreate Click options from the fields of a Pydantic model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[BaseModel]\nPyDantic model with fields to parse from the CLI\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.add_options_from_dataclass(config_class)\nCreate Click options from the fields of a dataclass.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig_class\nType[Any]\nDataclass with fields to parse from the CLI.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nFunction decorator for Axolotl CLI command.\n\n\n\n\n\n\n\ncli.utils.build_command(base_cmd, options)\nBuild command list from base command and options.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_cmd\nlist[str]\nCommand without options.\nrequired\n\n\noptions\ndict[str, Any]\nOptions to parse and append to base command.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nlist[str]\nList of strings giving shell command.\n\n\n\n\n\n\n\ncli.utils.download_file(file_info, raw_base_url, dest_path, dir_prefix)\nDownload a single 
file and return its processing status.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfile_info\ntuple\nTuple of (file_path, remote_sha).\nrequired\n\n\nraw_base_url\nstr\nBase URL for raw GitHub content.\nrequired\n\n\ndest_path\nPath\nLocal destination directory.\nrequired\n\n\ndir_prefix\nstr\nDirectory prefix to filter files.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[str, str]\nTuple of (file_path, status) where status is ‘new’, ‘updated’, or ‘unchanged’.\n\n\n\n\n\n\n\ncli.utils.fetch_from_github(dir_prefix, dest_dir=None, max_workers=5)\nSync files from a specific directory in the GitHub repository.\nOnly downloads files that don’t exist locally or have changed.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ndir_prefix\nstr\nDirectory prefix to filter files (e.g., ‘examples/’, ‘deepspeed_configs/’).\nrequired\n\n\ndest_dir\nstr | None\nLocal destination directory.\nNone\n\n\nmax_workers\nint\nMaximum number of concurrent downloads.\n5\n\n\n\n\n\n\n\ncli.utils.filter_none_kwargs(func)\nWraps function to remove None-valued kwargs.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfunc\nCallable\nFunction to wrap.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nCallable\nWrapped function.\n\n\n\n\n\n\n\ncli.utils.load_model_and_tokenizer(cfg, inference=False)\nHelper function for loading a model, tokenizer, and processor specified in the given axolotl\nconfig.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ninference\nbool\nBoolean denoting inference mode.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any, ProcessorMixin | None]\nTuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin).\n\n\n\n\n\n\n\ncli.utils.strip_optional_type(field_type)\nExtracts the non-None type 
from an Optional / Union type.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nfield_type\ntype | str | None\nType of field for Axolotl CLI command.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nIf the input type is Union[T, None] or Optional[T], returns T. Otherwise returns the input type unchanged."
   },
   {
-    "objectID": "docs/api/core.datasets.chat.html",
-    "href": "docs/api/core.datasets.chat.html",
-    "title": "core.datasets.chat",
+    "objectID": "docs/api/utils.tokenization.html",
+    "href": "docs/api/utils.tokenization.html",
+    "title": "utils.tokenization",
     "section": "",
-    "text": "core.datasets.chat\nchat dataset module\n\n\n\n\n\nName\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    self,\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
+    "text": "utils.tokenization\nModule for tokenization utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
   },
   {
-    "objectID": "docs/api/core.datasets.chat.html#classes",
-    "href": "docs/api/core.datasets.chat.html#classes",
-    "title": "core.datasets.chat",
+    "objectID": "docs/api/utils.tokenization.html#functions",
+    "href": "docs/api/utils.tokenization.html#functions",
+    "title": "utils.tokenization",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nTokenizedChatDataset\nTokenized chat dataset\n\n\n\n\n\ncore.datasets.chat.TokenizedChatDataset(\n    self,\n    data,\n    model_transform,\n    *args,\n    message_transform=None,\n    formatter=None,\n    process_count=None,\n    keep_in_memory=False,\n    **kwargs,\n)\nTokenized chat dataset"
+    "text": "Name\nDescription\n\n\n\n\ncolor_token_for_rl_debug\nHelper function to color tokens based on their type.\n\n\nprocess_tokens_for_rl_debug\nHelper function to process and color tokens.\n\n\n\n\n\nutils.tokenization.color_token_for_rl_debug(\n    decoded_token,\n    encoded_token,\n    color,\n    text_only,\n)\nHelper function to color tokens based on their type.\n\n\n\nutils.tokenization.process_tokens_for_rl_debug(\n    tokens,\n    color,\n    tokenizer,\n    text_only,\n)\nHelper function to process and color tokens."
   },
   {
-    "objectID": "docs/api/prompt_strategies.llama2_chat.html",
-    "href": "docs/api/prompt_strategies.llama2_chat.html",
-    "title": "prompt_strategies.llama2_chat",
+    "objectID": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
+    "href": "docs/api/monkeypatch.btlm_attn_hijack_flash.html",
+    "title": "monkeypatch.btlm_attn_hijack_flash",
     "section": "",
-    "text": "prompt_strategies.llama2_chat\nPrompt Strategy for finetuning Llama2 chat models\nsee also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.\nThis implementation is based on the Vicuna PR and the fastchat repo, see also:\nhttps://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847\nUse dataset type: “llama2_chat” in conig.yml to use this prompt style.\nE.g. in the config.yml:\ndatasets:\n  - path: llama_finetune_train.jsonl\n    type: llama2_chat\nThe dataset itself should look like this:\n{'conversations':[{\"from\": \"human\", \"value\": \"Who are you?\"}, {\"from\": \"gpt\", \"value\": \"I am Vicuna\"},...]}\nin a jsonl file. The first message should be from the human, the second from gpt.\nFor a custom system message, the first “from” can be “system” (followed by alternating “human” and “gpt” turns).\nImportant: Don’t use “special_tokens:” in your config.yml if you are not sure what you are doing!\n\n\n\n\n\nName\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n    self,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    self,\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. 
Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
+    "text": "monkeypatch.btlm_attn_hijack_flash\nmonkeypatch.btlm_attn_hijack_flash\nFlash attention monkey patch for cerebras btlm model"
   },
   {
-    "objectID": "docs/api/prompt_strategies.llama2_chat.html#classes",
-    "href": "docs/api/prompt_strategies.llama2_chat.html#classes",
-    "title": "prompt_strategies.llama2_chat",
+    "objectID": "docs/api/prompt_strategies.completion.html",
+    "href": "docs/api/prompt_strategies.completion.html",
+    "title": "prompt_strategies.completion",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLLama2ChatTokenizingStrategy\nTokenizing strategy for Llama2 prompts.\n\n\nLlama2ChatConversation\nA class that manages prompt templates and keeps all conversation history.\n\n\nLlama2ChatPrompter\nA prompter that generates prompts for Llama2 models.\n\n\n\n\n\nprompt_strategies.llama2_chat.LLama2ChatTokenizingStrategy(\n    self,\n    *args,\n    **kwargs,\n)\nTokenizing strategy for Llama2 prompts.\nadapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation(\n    self,\n    name='llama2',\n    system=\"[INST] &lt;&lt;SYS&gt;&gt;\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\n\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\\n&lt;&lt;/SYS&gt;&gt;\\n\\n\",\n    roles=('[INST]', '[/INST]'),\n    messages=list(),\n    offset=0,\n)\nA class that manages prompt templates and keeps all conversation history.\ncopied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py\n\n\n\n\n\nName\nDescription\n\n\n\n\nappend_message\nAppend a new message.\n\n\nget_prompt\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.append_message(\n    role,\n    message,\n)\nAppend a new message.\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatConversation.get_prompt()\nGet the prompt for generation.\n\n\n\n\n\nprompt_strategies.llama2_chat.Llama2ChatPrompter()\nA prompter that generates prompts for Llama2 models."
+    "text": "prompt_strategies.completion\nBasic completion text\n\n\n\n\n\nName\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n    self,\n    *args,\n    max_length=None,\n    **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion"
   },
   {
-    "objectID": "docs/api/prompt_strategies.messages.chat.html",
-    "href": "docs/api/prompt_strategies.messages.chat.html",
-    "title": "prompt_strategies.messages.chat",
+    "objectID": "docs/api/prompt_strategies.completion.html#classes",
+    "href": "docs/api/prompt_strategies.completion.html#classes",
+    "title": "prompt_strategies.completion",
     "section": "",
-    "text": "prompt_strategies.messages.chat\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n    self,\n    processor,\n    message_transform=None,\n    formatter=None,\n    **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations"
+    "text": "Name\nDescription\n\n\n\n\nCompletionPromptTokenizingStrategy\nTokenizing strategy for Completion prompts.\n\n\nCompletionPrompter\nPrompter for completion\n\n\n\n\n\nprompt_strategies.completion.CompletionPromptTokenizingStrategy(\n    self,\n    *args,\n    max_length=None,\n    **kwargs,\n)\nTokenizing strategy for Completion prompts.\n\n\n\nprompt_strategies.completion.CompletionPrompter()\nPrompter for completion"
   },
   {
-    "objectID": "docs/api/prompt_strategies.messages.chat.html#classes",
-    "href": "docs/api/prompt_strategies.messages.chat.html#classes",
-    "title": "prompt_strategies.messages.chat",
+    "objectID": "docs/api/prompt_strategies.kto.chatml.html",
+    "href": "docs/api/prompt_strategies.kto.chatml.html",
+    "title": "prompt_strategies.kto.chatml",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatMessageDatasetWrappingStrategy\nChat dataset wrapping strategy for new internal messages representations\n\n\n\n\n\nprompt_strategies.messages.chat.ChatMessageDatasetWrappingStrategy(\n    self,\n    processor,\n    message_transform=None,\n    formatter=None,\n    **kwargs,\n)\nChat dataset wrapping strategy for new internal messages representations"
+    "text": "prompt_strategies.kto.chatml\nKTO strategies for chatml\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/utils.dict.html",
-    "href": "docs/api/utils.dict.html",
-    "title": "utils.dict",
+    "objectID": "docs/api/prompt_strategies.kto.chatml.html#functions",
+    "href": "docs/api/prompt_strategies.kto.chatml.html#functions",
+    "title": "prompt_strategies.kto.chatml",
     "section": "",
-    "text": "utils.dict\nModule containing the DictDefault class\n\n\n\n\n\nName\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.chatml.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.chatml.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.chatml.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/utils.dict.html#classes",
-    "href": "docs/api/utils.dict.html#classes",
-    "title": "utils.dict",
+    "objectID": "docs/api/core.trainer_builder.html",
+    "href": "docs/api/core.trainer_builder.html",
+    "title": "core.trainer_builder",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nDictDefault\nA Dict that returns None instead of returning empty Dict for missing keys.\n\n\n\n\n\nutils.dict.DictDefault()\nA Dict that returns None instead of returning empty Dict for missing keys."
+    "text": "core.trainer_builder\nBuilder for the training args and trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n    trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
   },
   {
-    "objectID": "docs/api/cli.config.html",
-    "href": "docs/api/cli.config.html",
-    "title": "cli.config",
+    "objectID": "docs/api/core.trainer_builder.html#classes",
+    "href": "docs/api/core.trainer_builder.html#classes",
+    "title": "core.trainer_builder",
     "section": "",
-    "text": "cli.config\nConfiguration loading and processing.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "Name\nDescription\n\n\n\n\nHFCausalTrainerBuilder\nBuild the HuggingFace training args/trainer for causal models and reward modeling\n\n\nHFPPOTrainerBuilder\nHF Factory class for PPO Trainer\n\n\nHFRLTrainerBuilder\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\nTrainerBuilderBase\nBase class for trainer builder.\n\n\n\n\n\ncore.trainer_builder.HFCausalTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBuild the HuggingFace training args/trainer for causal models and reward modeling\nusing TRL.\n\n\n\ncore.trainer_builder.HFPPOTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nHF Factory class for PPO Trainer\n\n\n\ncore.trainer_builder.HFRLTrainerBuilder(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nTrainer factory class for TRL-based RLHF trainers (e.g. DPO)\n\n\n\ncore.trainer_builder.TrainerBuilderBase(\n    self,\n    cfg,\n    model,\n    tokenizer,\n    processor=None,\n)\nBase class for trainer builder.\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_post_trainer_create_callbacks\nCallbacks added after the trainer is created, usually b/c these need access to the trainer\n\n\n\n\n\ncore.trainer_builder.TrainerBuilderBase.get_post_trainer_create_callbacks(\n    trainer,\n)\nCallbacks added after the trainer is created, usually b/c these need access to the trainer"
   },
   {
-    "objectID": "docs/api/cli.config.html#functions",
-    "href": "docs/api/cli.config.html#functions",
-    "title": "cli.config",
+    "objectID": "docs/api/core.chat.format.chatml.html",
+    "href": "docs/api/core.chat.format.chatml.html",
+    "title": "core.chat.format.chatml",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ncheck_remote_config\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\n\n\nchoose_config\nHelper method for choosing a axolotl config YAML file (considering only files\n\n\nload_cfg\nLoads the axolotl configuration stored at config, validates it, and performs\n\n\nprepare_plugins\nRegisters the plugins for the given configuration.\n\n\n\n\n\ncli.config.check_remote_config(config)\nFirst, determines if the passed config is a valid HTTPS URL. Then, attempts to query\nfor it and parse its content, first as JSON, then as YAML (YAML is preferred).\nFinally, the parsed content is written to a local file and its path is returned.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[str, Path]\nHTTPS URL to a YAML or JSON file.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nUnion[str, Path]\nEither the original config if it’s not a valid HTTPS URL, or the path to the\n\n\n\nUnion[str, Path]\ndownloaded remote config.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf the remote configuration is neither valid JSON or YAML.\n\n\n\nRuntimeError\nIf some request-related exception occurs from the file download.\n\n\n\nException\nCatch-all for any other exception.\n\n\n\n\n\n\n\ncli.config.choose_config(path)\nHelper method for choosing a axolotl config YAML file (considering only files\nending with .yml or .yaml). 
If more than one config file exists in the passed\npath, the user is prompted to choose one.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\npath\nPath\nDirectory in which config file(s) are stored.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nstr\nPath to either (1) the sole YAML file, or (2) if more than one YAML files exist,\n\n\n\nstr\nthe user-selected YAML file.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf no YAML files are found in the given path.\n\n\n\n\n\n\n\ncli.config.load_cfg(config=Path('examples/'), **kwargs)\nLoads the axolotl configuration stored at config, validates it, and performs\nvarious setup.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nstr | Path | DictDefault\nPath (local or remote) to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nDictDefault\nDictDefault mapping configuration keys to values.\n\n\n\n\n\n\n\ncli.config.prepare_plugins(cfg)\nRegisters the plugins for the given configuration.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
+    "text": "core.chat.format.chatml\ncore.chat.format.chatml\nChatML transformation functions for MessageContents"
   },
   {
-    "objectID": "docs/api/core.trainers.trl.html",
-    "href": "docs/api/core.trainers.trl.html",
-    "title": "core.trainers.trl",
+    "objectID": "docs/api/cli.merge_lora.html",
+    "href": "docs/api/cli.merge_lora.html",
+    "title": "cli.merge_lora",
     "section": "",
-    "text": "core.trainers.trl\nModule for TRL PPO trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations"
+    "text": "cli.merge_lora\nCLI to merge a trained LoRA into a base model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/core.trainers.trl.html#classes",
-    "href": "docs/api/core.trainers.trl.html#classes",
-    "title": "core.trainers.trl",
+    "objectID": "docs/api/cli.merge_lora.html#functions",
+    "href": "docs/api/cli.merge_lora.html#functions",
+    "title": "cli.merge_lora",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlCPOTrainer\nExtend the base CPOTrainer for axolotl helpers\n\n\nAxolotlKTOTrainer\nExtend the base KTOTrainer for axolotl helpers\n\n\nAxolotlORPOTrainer\nExtend the base ORPOTrainer for axolotl helpers\n\n\nAxolotlPRMTrainer\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\nAxolotlRewardTrainer\nExtend the base RewardTrainer for axolotl helpers\n\n\nTRLPPOTrainer\nWrapper for TRL PPO trainer to handle customizations\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer()\nExtend the base CPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlCPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the CPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlKTOTrainer()\nExtend the base KTOTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer()\nExtend the base ORPOTrainer for axolotl helpers\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_batch_loss_metrics\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlORPOTrainer.get_batch_loss_metrics(\n    model,\n    batch,\n    train_eval='train',\n)\nCompute the ORPO loss and other metrics for the given batch of inputs for train or test.\n\n\n\n\n\ncore.trainers.trl.AxolotlPRMTrainer()\nExtend the base trl.PRMTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.AxolotlRewardTrainer()\nExtend the base RewardTrainer for axolotl helpers\n\n\n\ncore.trainers.trl.TRLPPOTrainer()\nWrapper for TRL PPO trainer to handle customizations"
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\n\n\ndo_merge_lora\nCalls transformers’ merge_and_unload on the model given in the axolotl config\n\n\n\n\n\ncli.merge_lora.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_merge_lora. Note that various\nconfig values will be overwritten to allow the LoRA merge logic to work as expected\n(load_in_8bit=False, load_in4bit=False, flash_attention=False, etc.).\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf target directory for LoRA merged model does not exist.\n\n\n\n\n\n\n\ncli.merge_lora.do_merge_lora(cfg)\nCalls transformers’ merge_and_unload on the model given in the axolotl config\nalong with the LoRA adapters to combine them into a single base model.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired"
   },
   {
-    "objectID": "docs/api/utils.data.pretraining.html",
-    "href": "docs/api/utils.data.pretraining.html",
-    "title": "utils.data.pretraining",
+    "objectID": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
+    "href": "docs/api/monkeypatch.llama_attn_hijack_xformers.html",
+    "title": "monkeypatch.llama_attn_hijack_xformers",
     "section": "",
-    "text": "utils.data.pretraining\nutils.data.pretraining\ndata handling specific to pretraining"
+    "text": "monkeypatch.llama_attn_hijack_xformers\nmonkeypatch.llama_attn_hijack_xformers\nDirectly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments"
   },
   {
-    "objectID": "docs/api/prompt_strategies.kto.user_defined.html",
-    "href": "docs/api/prompt_strategies.kto.user_defined.html",
-    "title": "prompt_strategies.kto.user_defined",
+    "objectID": "docs/api/utils.collators.core.html",
+    "href": "docs/api/utils.collators.core.html",
+    "title": "utils.collators.core",
     "section": "",
-    "text": "prompt_strategies.kto.user_defined\nprompt_strategies.kto.user_defined\nUser-defined KTO strategies"
+    "text": "utils.collators.core\nutils.collators.core\nbasic shared collator constants"
   },
   {
-    "objectID": "docs/api/utils.schedulers.html",
-    "href": "docs/api/utils.schedulers.html",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/prompt_tokenizers.html",
+    "href": "docs/api/prompt_tokenizers.html",
+    "title": "prompt_tokenizers",
     "section": "",
-    "text": "utils.schedulers\nModule for custom LRScheduler class\n\n\n\n\n\nName\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    self,\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n    self,\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? 
(https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
+    "text": "prompt_tokenizers\nModule containing PromptTokenizingStrategy and Prompter classes\n\n\n\n\n\nName\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat 
Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt 
function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n    result,\n    current_len,\n    res,\n    labels,\n    pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function"
   },
   {
-    "objectID": "docs/api/utils.schedulers.html#classes",
-    "href": "docs/api/utils.schedulers.html#classes",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/prompt_tokenizers.html#classes",
+    "href": "docs/api/prompt_tokenizers.html#classes",
+    "title": "prompt_tokenizers",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nInterpolatingLogScheduler\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\nRexLR\nReflected Exponential (REX) learning rate scheduler.\n\n\n\n\n\nutils.schedulers.InterpolatingLogScheduler(\n    self,\n    optimizer,\n    num_steps,\n    min_lr,\n    max_lr,\n    last_epoch=-1,\n)\nA scheduler that interpolates learning rates in a logarithmic fashion\n\n\n\nutils.schedulers.RexLR(\n    self,\n    optimizer,\n    max_lr,\n    min_lr,\n    total_steps=0,\n    num_warmup_steps=0,\n    last_step=0,\n)\nReflected Exponential (REX) learning rate scheduler.\n\nOriginal implementation: https://github.com/IvanVassi/REX_LR\nOriginal license: Apache 2.0\nBased on: https://arxiv.org/abs/2107.04197\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\ntorch.optim.Optimizer\nThe optimizer to schedule the learning rate for.\nrequired\n\n\nmax_lr\nfloat\nThe maximum learning rate.\nrequired\n\n\nmin_lr\nfloat\nThe minimum learning rate.\nrequired\n\n\ntotal_steps\nint\nThe total number of training steps.\n0\n\n\nnum_warmup_steps\nint\nThe number of warmup steps.\n0\n\n\nlast_step\nint\nThe index of last step.\n0"
+    "text": "Name\nDescription\n\n\n\n\nAlpacaMultipleChoicePromptTokenizingStrategy\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\nAlpacaPromptTokenizingStrategy\nTokenizing strategy for Alpaca prompts.\n\n\nAlpacaReflectionPTStrategy\nTokenizing strategy for Alpaca Reflection prompts.\n\n\nDatasetWrappingStrategy\nAbstract class for wrapping datasets for Chat Messages\n\n\nGPTeacherPromptTokenizingStrategy\nTokenizing strategy for GPTeacher prompts.\n\n\nInstructionPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nInvalidDataException\nException raised when the data is invalid\n\n\nJeopardyPromptTokenizingStrategy\nTokenizing strategy for Jeopardy prompts.\n\n\nNomicGPT4AllPromptTokenizingStrategy\nTokenizing strategy for NomicGPT4All prompts.\n\n\nOpenAssistantPromptTokenizingStrategy\nTokenizing strategy for OpenAssistant prompts.\n\n\nPromptTokenizingStrategy\nAbstract class for tokenizing strategies\n\n\nReflectionPromptTokenizingStrategy\nTokenizing strategy for Reflection prompts.\n\n\nSummarizeTLDRPromptTokenizingStrategy\nTokenizing strategy for SummarizeTLDR prompts.\n\n\n\n\n\nprompt_tokenizers.AlpacaMultipleChoicePromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Multiple Choice prompts.\n\n\n\nprompt_tokenizers.AlpacaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca prompts.\n\n\n\nprompt_tokenizers.AlpacaReflectionPTStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Alpaca Reflection prompts.\n\n\n\nprompt_tokenizers.DatasetWrappingStrategy()\nAbstract class for wrapping datasets for Chat Messages\n\n\n\nprompt_tokenizers.GPTeacherPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    
train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for GPTeacher prompts.\n\n\n\nprompt_tokenizers.InstructionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_tokenizers.InvalidDataException()\nException raised when the data is invalid\n\n\n\nprompt_tokenizers.JeopardyPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Jeopardy prompts.\n\n\n\nprompt_tokenizers.NomicGPT4AllPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for NomicGPT4All prompts.\n\n\n\nprompt_tokenizers.OpenAssistantPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenAssistant prompts.\n\n\n\nprompt_tokenizers.PromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nAbstract class for tokenizing strategies\n\n\n\nprompt_tokenizers.ReflectionPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for Reflection prompts.\n\n\n\nprompt_tokenizers.SummarizeTLDRPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for SummarizeTLDR prompts."
   },
   {
-    "objectID": "docs/api/utils.schedulers.html#functions",
-    "href": "docs/api/utils.schedulers.html#functions",
-    "title": "utils.schedulers",
+    "objectID": "docs/api/prompt_tokenizers.html#functions",
+    "href": "docs/api/prompt_tokenizers.html#functions",
+    "title": "prompt_tokenizers",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_cosine_schedule_with_min_lr\n\n\n\nget_cosine_schedule_with_quadratic_warmup\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\n\n\nget_cosine_schedule_with_warmup_decay_constant\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_min_lr(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    min_lr_ratio=0.0,\n)\n\n\n\nlinear warmup from 0 -&gt; max_lr over num_warmup_steps\ncosine learning rate annealing from max_lr -&gt; min_lr over num_training_steps\n\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_quadratic_warmup(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the\ninitial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nnum_cycles\nfloat, optional, defaults to 0.5\nThe number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine).\n0.5\n\n\nlast_epoch\nint, optional, defaults to -1\nThe index of the last epoch when resuming training.\n-1\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule.\n\n\n\n\nutils.schedulers.get_cosine_schedule_with_warmup_decay_constant(\n    optimizer,\n    num_warmup_steps,\n    num_training_steps,\n    
constant_lr_ratio,\n    min_lr_ratio,\n    num_cycles=0.5,\n    last_epoch=-1,\n)\nImplementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)\nCreate a schedule with a learning rate that decreases following the values of the cosine function between the\ninitial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate\n, after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\noptimizer\n[~torch.optim.Optimizer]\nThe optimizer for which to schedule the learning rate.\nrequired\n\n\nnum_warmup_steps\nint\nThe number of steps for the warmup phase.\nrequired\n\n\nnum_training_steps\nint\nThe total number of training steps.\nrequired\n\n\nconstant_lr_ratio\nfloat\n(float): The ratio of num_training_steps to decrease by cosine function.\nrequired\n\n\nmin_lr_ratio\nfloat\n(float): The ratio of maximum learning rate for cosine function to decay to minimum learning rate.                            | _required_ | | num_cycles         |float, *optional*, defaults to 0.5 | The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 following a half-cosine). |0.5| | last_epoch         |int, *optional*, defaults to -1    | The index of the last epoch when resuming training.                                                                            |-1`\n\n\n\n\n\n\n\ntorch.optim.lr_scheduler.LambdaLR with the appropriate schedule."
+    "text": "Name\nDescription\n\n\n\n\nparse_tokenized_to_result\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\ntokenize_prompt_default\nReturns the default values for the tokenize prompt function\n\n\n\n\n\nprompt_tokenizers.parse_tokenized_to_result(\n    result,\n    current_len,\n    res,\n    labels,\n    pad_token_id=None,\n)\nParses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result\n\n\n\nprompt_tokenizers.tokenize_prompt_default()\nReturns the default values for the tokenize prompt function"
   },
   {
-    "objectID": "docs/api/utils.freeze.html",
-    "href": "docs/api/utils.freeze.html",
-    "title": "utils.freeze",
+    "objectID": "docs/api/cli.preprocess.html",
+    "href": "docs/api/cli.preprocess.html",
+    "title": "cli.preprocess",
     "section": "",
-    "text": "utils.freeze\nmodule to freeze/unfreeze parameters by name\n\n\n\n\n\nName\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place."
+    "text": "cli.preprocess\nCLI to run preprocessing of a dataset.\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired"
   },
   {
-    "objectID": "docs/api/utils.freeze.html#classes",
-    "href": "docs/api/utils.freeze.html#classes",
-    "title": "utils.freeze",
+    "objectID": "docs/api/cli.preprocess.html#functions",
+    "href": "docs/api/cli.preprocess.html#functions",
+    "title": "cli.preprocess",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLayerNamePattern\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nutils.freeze.LayerNamePattern(self, pattern)\nRepresents a regex pattern for layer names, potentially including a parameter index range.\n\n\n\n\n\nName\nDescription\n\n\n\n\nmatch\nChecks if the given layer name matches the regex pattern.\n\n\n\n\n\nutils.freeze.LayerNamePattern.match(name)\nChecks if the given layer name matches the regex pattern.\nParameters:\n- name (str): The layer name to check.\nReturns:\n- bool: True if the layer name matches the pattern, False otherwise."
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\ndo_preprocess\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\ncli.preprocess.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls do_preprocess.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.preprocess.do_preprocess(cfg, cli_args)\nPreprocesses dataset specified in axolotl config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nDictionary mapping axolotl config keys to values.\nrequired\n\n\ncli_args\nPreprocessCliArgs\nPreprocessing-specific CLI arguments.\nrequired"
   },
   {
-    "objectID": "docs/api/utils.freeze.html#functions",
-    "href": "docs/api/utils.freeze.html#functions",
-    "title": "utils.freeze",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nfreeze_layers_except\nFreezes all layers of the given model except for the layers that match given regex patterns.\n\n\n\n\n\nutils.freeze.freeze_layers_except(model, regex_patterns)\nFreezes all layers of the given model except for the layers that match given regex patterns.\nPeriods in the patterns are treated as literal periods, not as wildcard characters.\nParameters:\n- model (nn.Module): The PyTorch model to be modified.\n- regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.\nNote that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.\nAlso, to match the entire layer name, the pattern should start with “^” and end with “\\(\", otherwise it will match any part of the layer name.\n  The range pattern part is optional and it is not compiled as a regex pattern which means you must put \"\\)” before the range pattern if you want to match the entire layer name.\nE.g., [“^model.embed_tokens.weight\\([:32000]\", \"layers.2[0-9]+.block_sparse_moe.gate.[a-z]+\\)”]\nReturns:\nNone; the model is modified in place."
+    "text": "cli.merge_sharded_fsdp_weights\nCLI to merge sharded FSDP model checkpoints into a single combined checkpoint.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    safe_serialization=False,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. 
Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
   },
   {
-    "objectID": "docs/api/kernels.swiglu.html",
-    "href": "docs/api/kernels.swiglu.html",
-    "title": "kernels.swiglu",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#classes",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "kernels.swiglu\nModule for definition of SwiGLU Triton kernels.\nSee “GLU Variants Improve Transformer” (https://arxiv.org/abs/2002.05202).\nCredit to unsloth (https://unsloth.ai/) for inspiration for this implementation.\n\n\n\n\n\nName\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "Name\nDescription\n\n\n\n\nBFloat16CastPlanner\nA custom planner to cast tensors to bfloat16 on the fly during loading.\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.BFloat16CastPlanner()\nA custom planner to cast tensors to bfloat16 on the fly during loading."
   },
   {
-    "objectID": "docs/api/kernels.swiglu.html#functions",
-    "href": "docs/api/kernels.swiglu.html#functions",
-    "title": "kernels.swiglu",
+    "objectID": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
+    "href": "docs/api/cli.merge_sharded_fsdp_weights.html#functions",
+    "title": "cli.merge_sharded_fsdp_weights",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nswiglu_backward\nSwiGLU backward pass using in-place operations.\n\n\nswiglu_forward\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\n\n\n\n\n\nkernels.swiglu.swiglu_backward(grad_output, gate, up)\nSwiGLU backward pass using in-place operations.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngrad_output\ntorch.Tensor\nGradient of loss with respect to output, shape [batch, seq_len, hidden_dim].\nrequired\n\n\ngate\ntorch.Tensor\nGate tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor from forward pass, shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[torch.Tensor, torch.Tensor, torch.Tensor]\nTuple containing: - Forward pass output (h) - Gradient with respect to gate (df) - Gradient with respect to up-projection (de)\n\n\n\n\n\n\n\nkernels.swiglu.swiglu_forward(gate, up)\nSwiGLU forward pass. Computes SwiGLU activation: x * sigmoid(x) * up, where\nx is the gate tensor.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ngate\ntorch.Tensor\nInput gate tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\nup\ntorch.Tensor\nUp-projection tensor of shape [batch, seq_len, hidden_dim].\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntorch.Tensor\nOutput tensor of shape [batch, seq_len, hidden_dim]."
+    "text": "Name\nDescription\n\n\n\n\ndo_cli\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\nmerge_fsdp_weights\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.do_cli(config=Path('examples/'), **kwargs)\nParses axolotl config, CLI args, and calls merge_fsdp_weights.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nconfig\nUnion[Path, str]\nPath to axolotl config YAML file.\nPath('examples/')\n\n\nkwargs\n\nAdditional keyword arguments to override config file values.\n{}\n\n\n\n\n\n\n\ncli.merge_sharded_fsdp_weights.merge_fsdp_weights(\n    checkpoint_dir,\n    output_path,\n    safe_serialization=False,\n    remove_checkpoint_dir=False,\n)\nMerge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if\nSHARDED_STATE_DICT was used for the model. Weights will be saved to {output_path}/model.safetensors if\nsafe_serialization else pytorch_model.bin.\nNote: this is a CPU-bound process.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncheckpoint_dir\nstr\nThe directory containing the FSDP checkpoints (can be either the model or optimizer).\nrequired\n\n\noutput_path\nstr\nThe path to save the merged checkpoint.\nrequired\n\n\nsafe_serialization\nbool, optional, defaults to True\nWhether to save the merged weights with safetensors (recommended).\nFalse\n\n\nremove_checkpoint_dir\nbool, optional, defaults to False\nWhether to remove the checkpoint directory after merging.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\nValueError\nIf torch version &lt; 2.3.0, or if checkpoint_dir does not exist."
   },
   {
-    "objectID": "docs/api/integrations.base.html",
-    "href": "docs/api/integrations.base.html",
-    "title": "integrations.base",
+    "objectID": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
+    "href": "docs/api/monkeypatch.data.batch_dataset_fetcher.html",
+    "title": "monkeypatch.data.batch_dataset_fetcher",
     "section": "",
-    "text": "integrations.base\nBase class for all plugins.\nA plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.\nPlugins can be used to integrate third-party models, modify the training process, or add new features.\nTo create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.\n\n\n\n\n\nName\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\nload_datasets(cfg): Loads and preprocesses the dataset for training.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.\npost_trainer_create(cfg, trainer): Performs actions after the trainer is created.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after 
training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the 
plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\noptimizer\nobject\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\nLRScheduler\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the trainer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the 
datasets.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe axolotl configuration\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for 
training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input 
arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model has been loaded\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\nParameters:\ntrainer 
(object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\nParameters:\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nThe configuration for the plugins.\nrequired\n\n\npreprocess\n\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins 
after the model has been built/loaded,\nbut before any adapters have been applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugins.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the post_model_load method of all registered plugins after the model has been loaded\ninclusive of any adapters\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the post_trainer_create method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be 
imported.\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported."
+    "text": "monkeypatch.data.batch_dataset_fetcher\nmonkeypatch.data.batch_dataset_fetcher\nmonkey patches for the dataset fetcher to handle batches of packed indexes"
   },
   {
-    "objectID": "docs/api/integrations.base.html#classes",
-    "href": "docs/api/integrations.base.html#classes",
-    "title": "integrations.base",
+    "objectID": "docs/api/utils.schemas.integrations.html",
+    "href": "docs/api/utils.schemas.integrations.html",
+    "title": "utils.schemas.integrations",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nBaseOptimizerFactory\nBase class for factories to create custom optimizers\n\n\nBasePlugin\nBase class for all plugins. Defines the interface for plugin methods.\n\n\nPluginManager\nThe PluginManager class is responsible for loading and managing plugins.\n\n\n\n\n\nintegrations.base.BaseOptimizerFactory()\nBase class for factories to create custom optimizers\n\n\n\nintegrations.base.BasePlugin(self)\nBase class for all plugins. Defines the interface for plugin methods.\nAttributes:\nNone\nMethods:\nregister(cfg): Registers the plugin with the given configuration.\nload_datasets(cfg): Loads and preprocesses the dataset for training.\npre_model_load(cfg): Performs actions before the model is loaded.\npost_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.\npre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.\npost_lora_load(cfg, model): Performs actions after LoRA weights are loaded.\npost_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.\npost_trainer_create(cfg, trainer): Performs actions after the trainer is created.\ncreate_optimizer(cfg, trainer): Creates and returns an optimizer for training.\ncreate_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.\nadd_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.\nadd_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nAdds callbacks to the trainer after creating the trainer.\n\n\nadd_callbacks_pre_trainer\nsetup callbacks before creating the trainer.\n\n\ncreate_lr_scheduler\nCreates and returns a learning rate scheduler.\n\n\ncreate_optimizer\nCreates and returns an optimizer for training.\n\n\nget_input_args\nReturns a pydantic model for the plugin’s input 
arguments.\n\n\nget_trainer_cls\nReturns a custom class for the trainer.\n\n\nload_datasets\nLoads and preprocesses the dataset for training.\n\n\npost_lora_load\nPerforms actions after LoRA weights are loaded.\n\n\npost_model_build\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\npost_model_load\nPerforms actions after the model is loaded.\n\n\npost_train\nPerforms actions after training is complete.\n\n\npost_train_unload\nPerforms actions after training is complete and the model is unloaded.\n\n\npost_trainer_create\nPerforms actions after the trainer is created.\n\n\npre_lora_load\nPerforms actions before LoRA weights are loaded.\n\n\npre_model_load\nPerforms actions before the model is loaded.\n\n\nregister\nRegisters the plugin with the given configuration.\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_post_trainer(cfg, trainer)\nAdds callbacks to the trainer after creating the trainer.\nThis is useful for callbacks that require access to the model or trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.add_callbacks_pre_trainer(cfg, model)\nsetup callbacks before creating the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nList[callable]: A list of callback functions to be added to the TrainingArgs\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_lr_scheduler(\n    cfg,\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCreates and returns a learning rate 
scheduler.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\noptimizer\nobject\nThe optimizer for training.\nrequired\n\n\nnum_training_steps\nint\nTotal number of training steps\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\nLRScheduler\nThe created learning rate scheduler.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.create_optimizer(cfg, trainer)\nCreates and returns an optimizer for training.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nobject\n\nThe created optimizer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.get_input_args()\nReturns a pydantic model for the plugin’s input arguments.\n\n\n\nintegrations.base.BasePlugin.get_trainer_cls(cfg)\nReturns a custom class for the trainer.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe global axolotl configuration.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nclass\n\nThe class for the trainer.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.load_datasets(cfg, preprocess=False)\nLoads and preprocesses the dataset for training.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\nDictDefault\nThe configuration for the plugin.\nrequired\n\n\npreprocess\nbool\nWhether this is the preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe metadata for the training dataset.\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_lora_load(cfg, model)\nPerforms actions after LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded 
model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_build(cfg, model)\nPerforms actions after the model is built/loaded, but before any adapters are applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_model_load(cfg, model)\nPerforms actions after the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train(cfg, model)\nPerforms actions after training is complete.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe axolotl configuration\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_train_unload(cfg)\nPerforms actions after training is complete and the model is unloaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.post_trainer_create(cfg, trainer)\nPerforms actions after the trainer is created.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\ntrainer\nobject\nThe trainer object for training.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_lora_load(cfg, model)\nPerforms actions before LoRA weights are loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\nmodel\nobject\nThe loaded 
model.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.pre_model_load(cfg)\nPerforms actions before the model is loaded.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugin.\nrequired\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\n\nNone\n\n\n\n\n\n\n\nintegrations.base.BasePlugin.register(cfg)\nRegisters the plugin with the given configuration.\nParameters:\ncfg (dict): The configuration for the plugin.\nReturns:\nNone\n\n\n\n\n\nintegrations.base.PluginManager()\nThe PluginManager class is responsible for loading and managing plugins.\nIt should be a singleton so it can be accessed from anywhere in the codebase.\nAttributes:\nplugins (ListBasePlugin): A list of loaded plugins.\nMethods:\nget_instance(): Static method to get the singleton instance of PluginManager.\nregister(plugin_name: str): Registers a new plugin by its name.\npre_model_load(cfg): Calls the pre_model_load method of all registered plugins.\n\n\n\n\n\nName\nDescription\n\n\n\n\nadd_callbacks_post_trainer\nCalls the add_callbacks_post_trainer method of all registered plugins.\n\n\nadd_callbacks_pre_trainer\nCalls the add_callbacks_pre_trainer method of all registered plugins.\n\n\ncreate_lr_scheduler\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\n\n\ncreate_optimizer\nCalls the create_optimizer method of all registered plugins and returns the first non-None optimizer.\n\n\nget_input_args\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\n\n\nget_instance\nReturns the singleton instance of PluginManager.\n\n\nget_trainer_cls\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\n\n\nload_datasets\nCalls the load_datasets method of each registered plugin.\n\n\npost_lora_load\nCalls the post_lora_load method of all registered 
plugins.\n\n\npost_model_build\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\n\n\npost_model_load\nCalls the post_model_load method of all registered plugins after the model has been loaded\n\n\npost_train\nCalls the post_train method of all registered plugins.\n\n\npost_train_unload\nCalls the post_train_unload method of all registered plugins.\n\n\npost_trainer_create\nCalls the post_trainer_create method of all registered plugins.\n\n\npre_lora_load\nCalls the pre_lora_load method of all registered plugins.\n\n\npre_model_load\nCalls the pre_model_load method of all registered plugins.\n\n\nregister\nRegisters a new plugin by its name.\n\n\n\n\n\nintegrations.base.PluginManager.add_callbacks_post_trainer(cfg, trainer)\nCalls the add_callbacks_post_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.add_callbacks_pre_trainer(cfg, model)\nCalls the add_callbacks_pre_trainer method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nList[callable]: A list of callback functions to be added to the TrainingArgs.\n\n\n\nintegrations.base.PluginManager.create_lr_scheduler(\n    trainer,\n    optimizer,\n    num_training_steps,\n)\nCalls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.\nParameters:\ntrainer (object): The trainer object for training.\noptimizer (object): The optimizer for training.\nReturns:\nobject: The created learning rate scheduler, or None if none was found.\n\n\n\nintegrations.base.PluginManager.create_optimizer(trainer)\nCalls the create_optimizer method of all registered plugins and returns the first non-None 
optimizer.\nParameters:\ntrainer (object): The trainer object for training.\nReturns:\nobject: The created optimizer, or None if none was found.\n\n\n\nintegrations.base.PluginManager.get_input_args()\nReturns a list of Pydantic classes for all registered plugins’ input arguments.’\nReturns:\nlist[str]: A list of Pydantic classes for all registered plugins’ input arguments.’\n\n\n\nintegrations.base.PluginManager.get_instance()\nReturns the singleton instance of PluginManager.\nIf the instance doesn’t exist, it creates a new one.\n\n\n\nintegrations.base.PluginManager.get_trainer_cls(cfg)\nCalls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nobject: The trainer class, or None if none was found.\n\n\n\nintegrations.base.PluginManager.load_datasets(cfg, preprocess=False)\nCalls the load_datasets method of each registered plugin.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nThe configuration for the plugins.\nrequired\n\n\npreprocess\n\nWhether this is preprocess step of the datasets.\nFalse\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ndataset_meta\n\nThe dataset metadata loaded from all registered plugins.\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_lora_load(cfg, model)\nCalls the post_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_model_build(cfg, model)\nCalls the post_model_build method of all registered plugins after the model has been built/loaded,\nbut before any adapters have been applied.\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\ndict\nThe configuration for the plugins.\nrequired\n\n\nmodel\nobject\nThe loaded model.\nrequired\n\n\n\n\n\n\n\nintegrations.base.PluginManager.post_model_load(cfg, model)\nCalls the 
post_model_load method of all registered plugins after the model has been loaded\ninclusive of any adapters\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train(cfg, model)\nCalls the post_train method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_train_unload(cfg)\nCalls the post_train_unload method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.post_trainer_create(cfg, trainer)\nCalls the post_trainer_create method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\ntrainer (object): The trainer object for training.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_lora_load(cfg, model)\nCalls the pre_lora_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nmodel (object): The loaded model.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.pre_model_load(cfg)\nCalls the pre_model_load method of all registered plugins.\nParameters:\ncfg (dict): The configuration for the plugins.\nReturns:\nNone\n\n\n\nintegrations.base.PluginManager.register(plugin_name)\nRegisters a new plugin by its name.\nParameters:\nplugin_name (str): The name of the plugin to be registered.\nReturns:\nNone\nRaises:\nImportError: If the plugin module cannot be imported."
+    "text": "utils.schemas.integrations\nPydantic models for Axolotl integrations\n\n\n\n\n\nName\nDescription\n\n\n\n\nCometConfig\nComet configuration subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset"
   },
   {
-    "objectID": "docs/api/integrations.base.html#functions",
-    "href": "docs/api/integrations.base.html#functions",
-    "title": "integrations.base",
+    "objectID": "docs/api/utils.schemas.integrations.html#classes",
+    "href": "docs/api/utils.schemas.integrations.html#classes",
+    "title": "utils.schemas.integrations",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nload_plugin\nLoads a plugin based on the given plugin name.\n\n\n\n\n\nintegrations.base.load_plugin(plugin_name)\nLoads a plugin based on the given plugin name.\nThe plugin name should be in the format “module_name.class_name”.\nThis function splits the plugin name into module and class, imports the module,\nretrieves the class from the module, and creates an instance of the class.\nParameters:\nplugin_name (str): The name of the plugin to be loaded. The name should be in the format “module_name.class_name”.\nReturns:\nBasePlugin: An instance of the loaded plugin.\nRaises:\nImportError: If the plugin module cannot be imported."
+    "text": "Name\nDescription\n\n\n\n\nCometConfig\nComet configuration subset\n\n\nGradioConfig\nGradio configuration subset\n\n\nLISAConfig\nLISA configuration subset\n\n\nMLFlowConfig\nMLFlow configuration subset\n\n\nRayConfig\nRay launcher configuration subset\n\n\nWandbConfig\nWandb configuration subset\n\n\n\n\n\nutils.schemas.integrations.CometConfig()\nComet configuration subset\n\n\n\nutils.schemas.integrations.GradioConfig()\nGradio configuration subset\n\n\n\nutils.schemas.integrations.LISAConfig()\nLISA configuration subset\n\n\n\nutils.schemas.integrations.MLFlowConfig()\nMLFlow configuration subset\n\n\n\nutils.schemas.integrations.RayConfig()\nRay launcher configuration subset\n\n\n\nutils.schemas.integrations.WandbConfig()\nWandb configuration subset"
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html",
-    "href": "docs/api/cli.cloud.modal_.html",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/integrations.cut_cross_entropy.args.html",
+    "href": "docs/api/integrations.cut_cross_entropy.args.html",
+    "title": "integrations.cut_cross_entropy.args",
     "section": "",
-    "text": "cli.cloud.modal_\nModal Cloud support from CLI\n\n\n\n\n\nName\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation.\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
+    "text": "integrations.cut_cross_entropy.args\nModule for handling Cut Cross Entropy input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html#classes",
-    "href": "docs/api/cli.cloud.modal_.html#classes",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/integrations.cut_cross_entropy.args.html#classes",
+    "href": "docs/api/integrations.cut_cross_entropy.args.html#classes",
+    "title": "integrations.cut_cross_entropy.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nModalCloud\nModal Cloud implementation.\n\n\n\n\n\ncli.cloud.modal_.ModalCloud(self, config, app=None)\nModal Cloud implementation."
+    "text": "Name\nDescription\n\n\n\n\nCutCrossEntropyArgs\nInput args for Cut Cross Entropy.\n\n\n\n\n\nintegrations.cut_cross_entropy.args.CutCrossEntropyArgs()\nInput args for Cut Cross Entropy."
   },
   {
-    "objectID": "docs/api/cli.cloud.modal_.html#functions",
-    "href": "docs/api/cli.cloud.modal_.html#functions",
-    "title": "cli.cloud.modal_",
+    "objectID": "docs/api/logging_config.html",
+    "href": "docs/api/logging_config.html",
+    "title": "logging_config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nrun_cmd\nRun a command inside a folder, with Modal Volume reloading before and commit on success.\n\n\n\n\n\ncli.cloud.modal_.run_cmd(cmd, run_folder, volumes=None)\nRun a command inside a folder, with Modal Volume reloading before and commit on success."
+    "text": "logging_config\nCommon logging module for axolotl\n\n\n\n\n\nName\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
   },
   {
-    "objectID": "docs/api/core.trainers.grpo.trainer.html",
-    "href": "docs/api/core.trainers.grpo.trainer.html",
-    "title": "core.trainers.grpo.trainer",
+    "objectID": "docs/api/logging_config.html#classes",
+    "href": "docs/api/logging_config.html#classes",
+    "title": "logging_config",
     "section": "",
-    "text": "core.trainers.grpo.trainer\nAxolotl GRPO trainers (with and without sequence parallelism handling)\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    self,\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer()\nExtend the base GRPOTrainer for axolotl helpers"
+    "text": "Name\nDescription\n\n\n\n\nColorfulFormatter\nFormatter to add coloring to log messages by log type\n\n\n\n\n\nlogging_config.ColorfulFormatter()\nFormatter to add coloring to log messages by log type"
   },
   {
-    "objectID": "docs/api/core.trainers.grpo.trainer.html#classes",
-    "href": "docs/api/core.trainers.grpo.trainer.html#classes",
-    "title": "core.trainers.grpo.trainer",
+    "objectID": "docs/api/logging_config.html#functions",
+    "href": "docs/api/logging_config.html#functions",
+    "title": "logging_config",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlGRPOSequenceParallelTrainer\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\nAxolotlGRPOTrainer\nExtend the base GRPOTrainer for axolotl helpers\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer(\n    self,\n    model,\n    reward_funcs,\n    args=None,\n    train_dataset=None,\n    eval_dataset=None,\n    processing_class=None,\n    reward_processing_classes=None,\n    callbacks=None,\n    optimizers=(None, None),\n    peft_config=None,\n)\nExtend the base GRPOTrainer for sequence parallelism handling\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_train_dataloader\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOSequenceParallelTrainer.get_train_dataloader(\n)\nGet dataloader for training\n\n\n\n\n\ncore.trainers.grpo.trainer.AxolotlGRPOTrainer()\nExtend the base GRPOTrainer for axolotl helpers"
+    "text": "Name\nDescription\n\n\n\n\nconfigure_logging\nConfigure with default logging\n\n\n\n\n\nlogging_config.configure_logging()\nConfigure with default logging"
   },
   {
-    "objectID": "docs/api/cli.sweeps.html",
-    "href": "docs/api/cli.sweeps.html",
-    "title": "cli.sweeps",
+    "objectID": "docs/api/utils.callbacks.perplexity.html",
+    "href": "docs/api/utils.callbacks.perplexity.html",
+    "title": "utils.callbacks.perplexity",
     "section": "",
-    "text": "cli.sweeps\nUtilities for handling sweeps over configs for axolotl train CLI command\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}"
+    "text": "utils.callbacks.perplexity\ncallback to calculate perplexity as an evaluation metric.\n\n\n\n\n\nName\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
   },
   {
-    "objectID": "docs/api/cli.sweeps.html#functions",
-    "href": "docs/api/cli.sweeps.html#functions",
-    "title": "cli.sweeps",
+    "objectID": "docs/api/utils.callbacks.perplexity.html#classes",
+    "href": "docs/api/utils.callbacks.perplexity.html#classes",
+    "title": "utils.callbacks.perplexity",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ngenerate_sweep_configs\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\ncli.sweeps.generate_sweep_configs(base_config, sweeps_config)\nRecursively generates all possible configurations by applying sweeps to the base config.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbase_config\ndict\nThe original configuration dictionary\nrequired\n\n\nsweeps_config\ndict\nDictionary where keys are parameters and values are either: - lists of values to sweep independently - or for paired values, a list of dicts under the ’_’ key\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\nlist\nlist[dict[str, list]]\nList of all possible configuration dictionaries\n\n\n\n\n\n\nsweeps_config = {\n‘learning_rate’: [0.1, 0.01],\n’_’: [\n{‘load_in_8bit’: True, ‘adapter’: ‘lora’},\n{‘load_in_4bit’: True, ‘adapter’: ‘qlora’}\n]\n}"
+    "text": "Name\nDescription\n\n\n\n\nPerplexity\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity(self, tokenizer, max_seq_len, stride=512)\nCalculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.\nThis is a custom variant that doesn’t re-tokenize the input or re-load the model.\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute\nCompute perplexity in a fixed length sliding window across the sequence.\n\n\n\n\n\nutils.callbacks.perplexity.Perplexity.compute(model, references=None)\nCompute perplexity in a fixed length sliding window across the sequence."
   },
   {
-    "objectID": "docs/api/prompt_strategies.input_output.html",
-    "href": "docs/api/prompt_strategies.input_output.html",
-    "title": "prompt_strategies.input_output",
+    "objectID": "docs/api/prompt_strategies.dpo.llama3.html",
+    "href": "docs/api/prompt_strategies.dpo.llama3.html",
+    "title": "prompt_strategies.dpo.llama3",
     "section": "",
-    "text": "prompt_strategies.input_output\nModule for plain input/output prompt pairs\n\n\n\n\n\nName\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    self,\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
+    "text": "prompt_strategies.dpo.llama3\nDPO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/prompt_strategies.input_output.html#classes",
-    "href": "docs/api/prompt_strategies.input_output.html#classes",
-    "title": "prompt_strategies.input_output",
+    "objectID": "docs/api/prompt_strategies.dpo.llama3.html#functions",
+    "href": "docs/api/prompt_strategies.dpo.llama3.html#functions",
+    "title": "prompt_strategies.dpo.llama3",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nRawInputOutputPrompter\nprompter for raw i/o data\n\n\nRawInputOutputStrategy\nPrompt Strategy class for input/output pairs\n\n\n\n\n\nprompt_strategies.input_output.RawInputOutputPrompter()\nprompter for raw i/o data\n\n\n\nprompt_strategies.input_output.RawInputOutputStrategy(\n    self,\n    *args,\n    eos_token=None,\n    **kwargs,\n)\nPrompt Strategy class for input/output pairs"
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/dpo-mix-7k conversations\n\n\nicr\nchatml transforms for datasets with system, input, chosen, rejected\n\n\nintel\nFor Intel Orca DPO Pairs\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.dpo.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/dpo-mix-7k conversations\n\n\n\nprompt_strategies.dpo.llama3.icr(cfg, **kwargs)\nchatml transforms for datasets with system, input, chosen, rejected\nex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs\n\n\n\nprompt_strategies.dpo.llama3.intel(cfg, **kwargs)\nFor Intel Orca DPO Pairs\n\n\n\nprompt_strategies.dpo.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations"
   },
   {
-    "objectID": "docs/api/utils.gradient_checkpointing.unsloth.html",
-    "href": "docs/api/utils.gradient_checkpointing.unsloth.html",
-    "title": "utils.gradient_checkpointing.unsloth",
+    "objectID": "docs/api/monkeypatch.multipack.html",
+    "href": "docs/api/monkeypatch.multipack.html",
+    "title": "monkeypatch.multipack",
     "section": "",
-    "text": "utils.gradient_checkpointing.unsloth\nUnsloth checkpointing\n\n\n\n\n\nName\nDescription\n\n\n\n\nUnsloth_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
+    "text": "monkeypatch.multipack\nmonkeypatch.multipack\nmultipack patching for v2 of sample packing"
   },
   {
-    "objectID": "docs/api/utils.gradient_checkpointing.unsloth.html#classes",
-    "href": "docs/api/utils.gradient_checkpointing.unsloth.html#classes",
-    "title": "utils.gradient_checkpointing.unsloth",
+    "objectID": "docs/api/prompt_strategies.dpo.user_defined.html",
+    "href": "docs/api/prompt_strategies.dpo.user_defined.html",
+    "title": "prompt_strategies.dpo.user_defined",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nUnsloth_Offloaded_Gradient_Checkpointer\nSaves VRAM by smartly offloading to RAM.\n\n\n\n\n\nutils.gradient_checkpointing.unsloth.Unsloth_Offloaded_Gradient_Checkpointer()\nSaves VRAM by smartly offloading to RAM.\nTiny hit to performance, since we mask the movement via non blocking calls."
+    "text": "prompt_strategies.dpo.user_defined\nprompt_strategies.dpo.user_defined\nUser-defined DPO strategies"
   },
   {
     "objectID": "docs/api/cli.args.html",
     "href": "docs/api/cli.args.html",
     "title": "cli.args",
     "section": "",
-    "text": "cli.args\nModule for axolotl CLI command arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\nVllmServeCliArgs\nDataclass with CLI arguments for axolotl vllm-serve command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=1,\n    prompter=None,\n    download=True,\n    iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n    merge_lora=False,\n    prompter=None,\n    shard=False,\n    main_process_port=None,\n    num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command.\n\n\n\ncli.args.VllmServeCliArgs(\n    self,\n    tensor_parallel_size=None,\n    host=None,\n    port=None,\n    gpu_memory_utilization=None,\n    dtype=None,\n    max_model_len=None,\n    enable_prefix_caching=None,\n)\nDataclass with CLI arguments for axolotl vllm-serve command."
+    "text": "cli.args\nModule for axolotl CLI command arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\nVllmServeCliArgs\nDataclass with CLI arguments for axolotl vllm-serve command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=1,\n    prompter=None,\n    download=True,\n    iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n    merge_lora=False,\n    prompter=None,\n    shard=False,\n    main_process_port=None,\n    num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command.\n\n\n\ncli.args.VllmServeCliArgs(\n    self,\n    tensor_parallel_size=None,\n    host=None,\n    port=None,\n    gpu_memory_utilization=None,\n    dtype=None,\n    max_model_len=None,\n    enable_prefix_caching=None,\n    serve_module=None,\n)\nDataclass with CLI arguments for axolotl vllm-serve command."
   },
   {
     "objectID": "docs/api/cli.args.html#classes",
     "href": "docs/api/cli.args.html#classes",
     "title": "cli.args",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\nVllmServeCliArgs\nDataclass with CLI arguments for axolotl vllm-serve command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=1,\n    prompter=None,\n    download=True,\n    iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n    merge_lora=False,\n    prompter=None,\n    shard=False,\n    main_process_port=None,\n    num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command.\n\n\n\ncli.args.VllmServeCliArgs(\n    self,\n    tensor_parallel_size=None,\n    host=None,\n    port=None,\n    gpu_memory_utilization=None,\n    dtype=None,\n    max_model_len=None,\n    enable_prefix_caching=None,\n)\nDataclass with CLI arguments for axolotl vllm-serve command."
+    "text": "Name\nDescription\n\n\n\n\nEvaluateCliArgs\nDataclass with CLI arguments for axolotl evaluate command.\n\n\nInferenceCliArgs\nDataclass with CLI arguments for axolotl inference command.\n\n\nPreprocessCliArgs\nDataclass with CLI arguments for axolotl preprocess command.\n\n\nTrainerCliArgs\nDataclass with CLI arguments for axolotl train command.\n\n\nVllmServeCliArgs\nDataclass with CLI arguments for axolotl vllm-serve command.\n\n\n\n\n\ncli.args.EvaluateCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n)\nDataclass with CLI arguments for axolotl evaluate command.\n\n\n\ncli.args.InferenceCliArgs(self, prompter=None)\nDataclass with CLI arguments for axolotl inference command.\n\n\n\ncli.args.PreprocessCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=1,\n    prompter=None,\n    download=True,\n    iterable=None,\n)\nDataclass with CLI arguments for axolotl preprocess command.\n\n\n\ncli.args.TrainerCliArgs(\n    self,\n    debug=False,\n    debug_text_only=False,\n    debug_num_examples=0,\n    merge_lora=False,\n    prompter=None,\n    shard=False,\n    main_process_port=None,\n    num_processes=None,\n)\nDataclass with CLI arguments for axolotl train command.\n\n\n\ncli.args.VllmServeCliArgs(\n    self,\n    tensor_parallel_size=None,\n    host=None,\n    port=None,\n    gpu_memory_utilization=None,\n    dtype=None,\n    max_model_len=None,\n    enable_prefix_caching=None,\n    serve_module=None,\n)\nDataclass with CLI arguments for axolotl vllm-serve command."
   },
   {
     "objectID": "docs/api/cli.inference.html",
@@ -3394,95 +3408,95 @@
     "text": "Name\nDescription\n\n\n\n\napply_sequence_parallelism\nApply sequence parallelism slicing to a batch.\n\n\n\n\n\nutils.ctx_managers.sequence_parallel.apply_sequence_parallelism(\n    batch,\n    local_rank,\n    local_world_size,\n    gradient_accumulation_steps,\n    ring_attn_func,\n)\nApply sequence parallelism slicing to a batch.\nSpecial handling is implemented for integer logits_to_keep, which indicates\nto only keep the last N tokens in the sequence during generation.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nbatch\ndict[str, torch.Tensor]\nBatch dictionary (e.g., input_ids, attention_mask, etc.).\nrequired\n\n\nlocal_rank\nint\nLocal rank in the sequence parallel group.\nrequired\n\n\nlocal_world_size\nint\nWorld size of the sequence parallel group.\nrequired\n\n\ngradient_accumulation_steps\nint\nNumber of steps to accumulate gradients over.\nrequired\n\n\nring_attn_func\nRingAttnFunc\nWhich ring attention function to use. Currently unused, but related to above TODO.\nrequired\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\n\ntuple[dict[str, torch.Tensor], int, int]\ntuple of: - Batch dictionary with sliced tensors. - The original sequence length before padding. - The number of padding tokens added."
   },
   {
-    "objectID": "docs/api/integrations.kd.trainer.html",
-    "href": "docs/api/integrations.kd.trainer.html",
-    "title": "integrations.kd.trainer",
+    "objectID": "docs/api/kernels.utils.html",
+    "href": "docs/api/kernels.utils.html",
+    "title": "kernels.utils",
     "section": "",
-    "text": "integrations.kd.trainer\nKD trainer\n\n\n\n\n\nName\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n    self,\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+    "text": "kernels.utils\nkernels.utils\nUtilities for axolotl.kernels submodules."
   },
   {
-    "objectID": "docs/api/integrations.kd.trainer.html#classes",
-    "href": "docs/api/integrations.kd.trainer.html#classes",
-    "title": "integrations.kd.trainer",
+    "objectID": "docs/api/prompt_strategies.kto.llama3.html",
+    "href": "docs/api/prompt_strategies.kto.llama3.html",
+    "title": "prompt_strategies.kto.llama3",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nAxolotlKDTrainer\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer(\n    self,\n    *_args,\n    bench_data_collator=None,\n    eval_data_collator=None,\n    dataset_tags=None,\n    **kwargs,\n)\nCustom trainer subclass for Knowledge Distillation (KD)\n\n\n\n\n\nName\nDescription\n\n\n\n\ncompute_loss\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\n\n\n\n\n\nintegrations.kd.trainer.AxolotlKDTrainer.compute_loss(\n    model,\n    inputs,\n    return_outputs=False,\n    num_items_in_batch=None,\n)\nHow the loss is computed by Trainer. By default, all models return the loss in the first element.\nSubclass and override for custom behavior."
+    "text": "prompt_strategies.kto.llama3\nKTO strategies for llama-3 chat template\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/prompt_strategies.chat_template.html",
-    "href": "docs/api/prompt_strategies.chat_template.html",
-    "title": "prompt_strategies.chat_template",
+    "objectID": "docs/api/prompt_strategies.kto.llama3.html#functions",
+    "href": "docs/api/prompt_strategies.kto.llama3.html#functions",
+    "title": "prompt_strategies.kto.llama3",
     "section": "",
-    "text": "prompt_strategies.chat_template\nHF Chat Templates prompt strategy\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    self,\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    roles=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on 
configuration."
+    "text": "Name\nDescription\n\n\n\n\nargilla_chat\nfor argilla/kto-mix-15k conversations\n\n\nintel\nFor Intel Orca KTO\n\n\nultra\nfor ultrafeedback binarized conversations\n\n\n\n\n\nprompt_strategies.kto.llama3.argilla_chat(cfg, **kwargs)\nfor argilla/kto-mix-15k conversations\n\n\n\nprompt_strategies.kto.llama3.intel(cfg, **kwargs)\nFor Intel Orca KTO\nex: argilla/distilabel-intel-orca-kto\n\n\n\nprompt_strategies.kto.llama3.ultra(cfg, **kwargs)\nfor ultrafeedback binarized conversations\nex: argilla/ultrafeedback-binarized-preferences-cleaned-kto"
   },
   {
-    "objectID": "docs/api/prompt_strategies.chat_template.html#classes",
-    "href": "docs/api/prompt_strategies.chat_template.html#classes",
-    "title": "prompt_strategies.chat_template",
+    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html",
+    "href": "docs/api/prompt_strategies.alpaca_w_system.html",
+    "title": "prompt_strategies.alpaca_w_system",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatTemplatePrompter\nPrompter for HF chat templates\n\n\nChatTemplateStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nStrategyLoader\nLoad chat template strategy based on configuration.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplatePrompter(\n    self,\n    tokenizer,\n    chat_template,\n    processor=None,\n    max_length=2048,\n    message_property_mappings=None,\n    message_field_training=None,\n    message_field_training_detail=None,\n    field_messages='messages',\n    field_system='system',\n    roles=None,\n    drop_system_message=False,\n)\nPrompter for HF chat templates\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs,\n    sequence_len,\n    roles_to_train=None,\n    train_on_eos=None,\n    train_on_eot=None,\n    eot_tokens=None,\n    split_thinking=False,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\n\n\nName\nDescription\n\n\n\n\nfind_first_eot_token\nFind the first EOT token in the input_ids starting from start_idx.\n\n\nfind_turn\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\ntokenize_prompt\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_first_eot_token(\n    input_ids,\n    start_idx,\n)\nFind the first EOT token in the input_ids starting from start_idx.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.find_turn(turns, turn_idx)\nLocate the starting and ending indices of the specified turn in a conversation.\n\n\n\nprompt_strategies.chat_template.ChatTemplateStrategy.tokenize_prompt(prompt)\nPublic method that can handle either a single prompt or a batch of prompts.\n\n\n\n\n\nprompt_strategies.chat_template.StrategyLoader()\nLoad chat template strategy based on configuration."
+    "text": "prompt_strategies.alpaca_w_system\nPrompt strategies loader for alpaca instruction datasets with system prompts\n\n\n\n\n\nName\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
   },
   {
-    "objectID": "docs/api/integrations.lm_eval.args.html",
-    "href": "docs/api/integrations.lm_eval.args.html",
-    "title": "integrations.lm_eval.args",
+    "objectID": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
+    "href": "docs/api/prompt_strategies.alpaca_w_system.html#classes",
+    "title": "prompt_strategies.alpaca_w_system",
     "section": "",
-    "text": "integrations.lm_eval.args\nModule for handling lm eval harness input arguments.\n\n\n\n\n\nName\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
+    "text": "Name\nDescription\n\n\n\n\nInstructionWSystemPromptTokenizingStrategy\nTokenizing strategy for instruction-based prompts.\n\n\nOpenOrcaPromptTokenizingStrategy\nTokenizing strategy for OpenOrca datasets\n\n\nOpenOrcaSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\nSystemDataPrompter\nAlpaca Style Prompter that uses system prompts from the dataset\n\n\n\n\n\nprompt_strategies.alpaca_w_system.InstructionWSystemPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for instruction-based prompts.\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaPromptTokenizingStrategy(\n    self,\n    prompter,\n    tokenizer,\n    train_on_inputs=False,\n    sequence_len=2048,\n)\nTokenizing strategy for OpenOrca datasets\n\n\n\nprompt_strategies.alpaca_w_system.OpenOrcaSystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts\n\n\n\nprompt_strategies.alpaca_w_system.SystemDataPrompter(\n    self,\n    prompt_style=PromptStyle.INSTRUCT.value,\n)\nAlpaca Style Prompter that uses system prompts from the dataset"
   },
   {
-    "objectID": "docs/api/integrations.lm_eval.args.html#classes",
-    "href": "docs/api/integrations.lm_eval.args.html#classes",
-    "title": "integrations.lm_eval.args",
+    "objectID": "docs/api/utils.collators.mm_chat.html",
+    "href": "docs/api/utils.collators.mm_chat.html",
+    "title": "utils.collators.mm_chat",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nLMEvalArgs\nInput args for lm eval harness\n\n\n\n\n\nintegrations.lm_eval.args.LMEvalArgs()\nInput args for lm eval harness"
+    "text": "utils.collators.mm_chat\nCollators for multi-modal chat messages and packing\n\n\n\n\n\nName\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    self,\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
   },
   {
-    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
-    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html",
-    "title": "monkeypatch.mistral_attn_hijack_flash",
+    "objectID": "docs/api/utils.collators.mm_chat.html#classes",
+    "href": "docs/api/utils.collators.mm_chat.html#classes",
+    "title": "utils.collators.mm_chat",
     "section": "",
-    "text": "monkeypatch.mistral_attn_hijack_flash\nFlash attention monkey patch for mistral model\n\n\n\n\n\nName\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. 
See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
+    "text": "Name\nDescription\n\n\n\n\nMultiModalChatDataCollator\nCollator for multi-modal chat messages\n\n\n\n\n\nutils.collators.mm_chat.MultiModalChatDataCollator(\n    self,\n    tokenizer,\n    processing_strategy,\n    packing=False,\n    return_tensors='pt',\n    padding=True,\n    pad_to_multiple_of=None,\n)\nCollator for multi-modal chat messages"
   },
   {
-    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes",
-    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#classes",
-    "title": "monkeypatch.mistral_attn_hijack_flash",
+    "objectID": "docs/api/utils.bench.html",
+    "href": "docs/api/utils.bench.html",
+    "title": "utils.bench",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nMistralDecoderLayer\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer()\npatched version of MistralDecoderLayer to pass through the precalculated cu_seqlens\n\n\n\n\n\nName\nDescription\n\n\n\n\nforward\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.MistralDecoderLayer.forward(\n    hidden_states,\n    attention_mask=None,\n    position_ids=None,\n    past_key_value=None,\n    output_attentions=False,\n    use_cache=False,\n    cu_seqlens=None,\n    max_seqlen=None,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nhidden_states\ntorch.FloatTensor\ninput to the layer of shape (batch, seq_len, embed_dim)\nrequired\n\n\nattention_mask\ntorch.FloatTensor, optional\nattention mask of size (batch, 1, tgt_len, src_len) where padding elements are indicated by very large negative values.\nNone\n\n\noutput_attentions\nbool, optional\nWhether or not to return the attentions tensors of all attention layers. See attentions under returned tensors for more detail.\nFalse\n\n\nuse_cache\nbool, optional\nIf set to True, past_key_values key value states are returned and can be used to speed up decoding (see past_key_values).\nFalse\n\n\npast_key_value\nTuple(torch.FloatTensor), optional\ncached past key and value projection states\nNone"
+    "text": "utils.bench\nBenchmarking and measurement utilities\n\n\n\n\n\nName\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
   },
   {
-    "objectID": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions",
-    "href": "docs/api/monkeypatch.mistral_attn_hijack_flash.html#functions",
-    "title": "monkeypatch.mistral_attn_hijack_flash",
+    "objectID": "docs/api/utils.bench.html#functions",
+    "href": "docs/api/utils.bench.html#functions",
+    "title": "utils.bench",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\ngenerate_qkv\n\n\n\n\n\n\nmonkeypatch.mistral_attn_hijack_flash.generate_qkv(\n    q,\n    k,\n    v,\n    query_padding_mask=None,\n    key_padding_mask=None,\n    kvpacked=False,\n    qkvpacked=False,\n)\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\nq\n\n(batch_size, seqlen_q, nheads, d)\nrequired\n\n\nk\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nv\n\n(batch_size, seqlen_k, nheads_k, d)\nrequired\n\n\nquery_padding_mask\n\n(batch_size, seqlen), bool\nNone\n\n\nkey_padding_mask\n\n(batch_size, seqlen), bool\nNone"
+    "text": "Name\nDescription\n\n\n\n\ncheck_cuda_device\nwraps a function and returns the default value instead of running the\n\n\n\n\n\nutils.bench.check_cuda_device(default_value)\nwraps a function and returns the default value instead of running the\nwrapped function if cuda isn’t available or the device is auto\n:param default_value:\n:return:"
   },
   {
-    "objectID": "docs/api/core.chat.messages.html",
-    "href": "docs/api/core.chat.messages.html",
-    "title": "core.chat.messages",
+    "objectID": "docs/api/utils.callbacks.mlflow_.html",
+    "href": "docs/api/utils.callbacks.mlflow_.html",
+    "title": "utils.callbacks.mlflow_",
     "section": "",
-    "text": "core.chat.messages\ninternal message representations of chat messages\n\n\n\n\n\nName\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and 
parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
+    "text": "utils.callbacks.mlflow_\nMLFlow module for trainer callbacks\n\n\n\n\n\nName\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n    self,\n    axolotl_config_path,\n)\nCallback to save axolotl config to mlflow"
   },
   {
-    "objectID": "docs/api/core.chat.messages.html#classes",
-    "href": "docs/api/core.chat.messages.html#classes",
-    "title": "core.chat.messages",
+    "objectID": "docs/api/utils.callbacks.mlflow_.html#classes",
+    "href": "docs/api/utils.callbacks.mlflow_.html#classes",
+    "title": "utils.callbacks.mlflow_",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nChatFormattedChats\nChat formatted chats with formatter and optional train on inputs\n\n\nChats\ntop level data structure for chat conversations\n\n\nMessageContentTypes\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\nMessageContents\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\nMessageRoles\nMessage roles for the system, user, assistant, and tools\n\n\nMessages\nMessages with role, content, metadata, weight, and chat formatting\n\n\nPreferenceChats\nrepresentation for preference data for chat\n\n\nSpecialToken\nSpecial tokens for beginning of string and end of string\n\n\nTool\nTool with description, function, and parameters\n\n\nToolCallContents\nTool call contents with name, arguments, and optional id\n\n\nToolCallFunction\nTool call function with name and arguments\n\n\nToolResponseContents\nTool response contents with name, content, and optional id\n\n\n\n\n\ncore.chat.messages.ChatFormattedChats()\nChat formatted chats with formatter and optional train on inputs\n\n\n\ncore.chat.messages.Chats()\ntop level data structure for chat conversations\n\n\n\ncore.chat.messages.MessageContentTypes()\nMessage content types for text, image, audio, tool calls, and tool responses\n\n\n\ncore.chat.messages.MessageContents()\nMessage contents with type, value, metadata, weight, newline, and end of contents\n\n\n\ncore.chat.messages.MessageRoles()\nMessage roles for the system, user, assistant, and tools\n\n\n\ncore.chat.messages.Messages()\nMessages with role, content, metadata, weight, and chat formatting\n\n\n\ncore.chat.messages.PreferenceChats()\nrepresentation for preference data for chat\n\n\n\ncore.chat.messages.SpecialToken()\nSpecial tokens for beginning of string and end of string\n\n\n\ncore.chat.messages.Tool()\nTool with description, function, and parameters\n\n\n\ncore.chat.messages.ToolCallContents()\nTool call contents with name, arguments, 
and optional id\n\n\n\ncore.chat.messages.ToolCallFunction()\nTool call function with name and arguments\n\n\n\ncore.chat.messages.ToolResponseContents()\nTool response contents with name, content, and optional id"
+    "text": "Name\nDescription\n\n\n\n\nSaveAxolotlConfigtoMlflowCallback\nCallback to save axolotl config to mlflow\n\n\n\n\n\nutils.callbacks.mlflow_.SaveAxolotlConfigtoMlflowCallback(\n    self,\n    axolotl_config_path,\n)\nCallback to save axolotl config to mlflow"
   },
   {
-    "objectID": "docs/api/utils.lora_embeddings.html",
-    "href": "docs/api/utils.lora_embeddings.html",
-    "title": "utils.lora_embeddings",
+    "objectID": "docs/api/utils.callbacks.profiler.html",
+    "href": "docs/api/utils.callbacks.profiler.html",
+    "title": "utils.callbacks.profiler",
     "section": "",
-    "text": "utils.lora_embeddings\nhelpers for lora embeddings\n\n\n\n\n\nName\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch"
+    "text": "utils.callbacks.profiler\nHF Trainer callback for creating pytorch profiling snapshots\n\n\n\n\n\nName\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps."
   },
   {
-    "objectID": "docs/api/utils.lora_embeddings.html#functions",
-    "href": "docs/api/utils.lora_embeddings.html#functions",
-    "title": "utils.lora_embeddings",
+    "objectID": "docs/api/utils.callbacks.profiler.html#classes",
+    "href": "docs/api/utils.callbacks.profiler.html#classes",
+    "title": "utils.callbacks.profiler",
     "section": "",
-    "text": "Name\nDescription\n\n\n\n\nget_linear_embedding_layers\nreturns the linear embedding layers needed for loras, dependent on the model arch\n\n\n\n\n\nutils.lora_embeddings.get_linear_embedding_layers(model_type)\nreturns the linear embedding layers needed for loras, dependent on the model arch"
+    "text": "Name\nDescription\n\n\n\n\nPytorchProfilerCallback\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.\n\n\n\n\n\nutils.callbacks.profiler.PytorchProfilerCallback(self, steps_to_profile=5)\nPyTorch Profiler callback to create snapshots of GPU memory usage at specified steps."
   },
   {
     "objectID": "docs/api/utils.schemas.trl.html",
diff --git a/sitemap.xml b/sitemap.xml
index a4a5722d5..eecad3e33 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,718 +2,722 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://docs.axolotl.ai/FAQS.html</loc>
-    <lastmod>2025-05-12T21:52:48.677Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.788Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
-    <lastmod>2025-05-12T21:52:48.699Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.810Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/TODO.html</loc>
-    <lastmod>2025-05-12T21:52:48.677Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.788Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multimodal.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/multi-node.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/lora_optims.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html</loc>
-    <lastmod>2025-05-12T21:53:19.552Z</lastmod>
+    <lastmod>2025-05-13T20:40:24.496Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.model.html</loc>
-    <lastmod>2025-05-12T21:53:20.147Z</lastmod>
+    <lastmod>2025-05-13T20:40:25.114Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.distributed.html</loc>
-    <lastmod>2025-05-12T21:53:20.115Z</lastmod>
+    <lastmod>2025-05-13T20:40:25.057Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.collators.batching.html</loc>
-    <lastmod>2025-05-12T21:53:20.382Z</lastmod>
+    <lastmod>2025-05-13T20:40:25.347Z</lastmod>
   </url>
   <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html</loc>
-    <lastmod>2025-05-12T21:53:20.442Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html</loc>
-    <lastmod>2025-05-12T21:53:20.447Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.bench.html</loc>
-    <lastmod>2025-05-12T21:53:20.047Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html</loc>
-    <lastmod>2025-05-12T21:53:20.390Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html</loc>
-    <lastmod>2025-05-12T21:53:19.633Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html</loc>
-    <lastmod>2025-05-12T21:53:19.726Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.utils.html</loc>
-    <lastmod>2025-05-12T21:53:19.870Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html</loc>
-    <lastmod>2025-05-12T21:53:19.972Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html</loc>
-    <lastmod>2025-05-12T21:53:19.559Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/index.html</loc>
-    <lastmod>2025-05-12T21:53:19.008Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html</loc>
-    <lastmod>2025-05-12T21:53:19.523Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/evaluate.html</loc>
-    <lastmod>2025-05-12T21:53:19.079Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.utils.html</loc>
-    <lastmod>2025-05-12T21:53:20.213Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.base.html</loc>
-    <lastmod>2025-05-12T21:53:19.487Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.lora.html</loc>
-    <lastmod>2025-05-12T21:53:19.841Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/common.const.html</loc>
-    <lastmod>2025-05-12T21:53:20.344Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.training.html</loc>
-    <lastmod>2025-05-12T21:53:20.152Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html</loc>
-    <lastmod>2025-05-12T21:53:19.619Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.base.html</loc>
-    <lastmod>2025-05-12T21:53:19.588Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html</loc>
-    <lastmod>2025-05-12T21:53:19.963Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mamba.html</loc>
-    <lastmod>2025-05-12T21:53:20.386Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.train.html</loc>
-    <lastmod>2025-05-12T21:53:19.331Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html</loc>
-    <lastmod>2025-05-12T21:53:20.126Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.relora.html</loc>
-    <lastmod>2025-05-12T21:53:19.920Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.utils.html</loc>
-    <lastmod>2025-05-12T21:53:19.954Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html</loc>
-    <lastmod>2025-05-12T21:53:19.755Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.config.html</loc>
-    <lastmod>2025-05-12T21:53:20.140Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/convert.html</loc>
-    <lastmod>2025-05-12T21:53:19.100Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html</loc>
-    <lastmod>2025-05-12T21:53:20.431Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html</loc>
-    <lastmod>2025-05-12T21:53:20.451Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.lora.html</loc>
-    <lastmod>2025-05-12T21:53:20.034Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.peft.html</loc>
-    <lastmod>2025-05-12T21:53:20.178Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html</loc>
-    <lastmod>2025-05-12T21:53:20.187Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html</loc>
-    <lastmod>2025-05-12T21:53:19.716Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html</loc>
-    <lastmod>2025-05-12T21:53:19.913Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html</loc>
-    <lastmod>2025-05-12T21:53:19.703Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html</loc>
-    <lastmod>2025-05-12T21:53:20.439Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/logging_config.html</loc>
-    <lastmod>2025-05-12T21:53:19.146Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html</loc>
-    <lastmod>2025-05-12T21:53:20.322Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html</loc>
-    <lastmod>2025-05-12T21:53:20.199Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html</loc>
-    <lastmod>2025-05-12T21:53:19.982Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html</loc>
-    <lastmod>2025-05-12T21:53:19.414Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.preprocess.html</loc>
-    <lastmod>2025-05-12T21:53:19.422Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_tokenizers.html</loc>
-    <lastmod>2025-05-12T21:53:19.141Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.collators.core.html</loc>
-    <lastmod>2025-05-12T21:53:20.363Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html</loc>
-    <lastmod>2025-05-12T21:53:19.897Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.merge_lora.html</loc>
-    <lastmod>2025-05-12T21:53:19.402Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html</loc>
-    <lastmod>2025-05-12T21:53:19.275Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainer_builder.html</loc>
-    <lastmod>2025-05-12T21:53:19.161Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html</loc>
-    <lastmod>2025-05-12T21:53:19.734Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html</loc>
-    <lastmod>2025-05-12T21:53:19.660Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html</loc>
-    <lastmod>2025-05-12T21:53:19.956Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.tokenization.html</loc>
-    <lastmod>2025-05-12T21:53:20.019Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.utils.html</loc>
-    <lastmod>2025-05-12T21:53:19.459Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.main.html</loc>
-    <lastmod>2025-05-12T21:53:19.323Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.attention.mllama.html</loc>
-    <lastmod>2025-05-12T21:53:19.981Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.checks.html</loc>
-    <lastmod>2025-05-12T21:53:19.363Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html</loc>
-    <lastmod>2025-05-12T21:53:19.687Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.sequence_parallel.html</loc>
-    <lastmod>2025-05-12T21:53:19.562Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html</loc>
-    <lastmod>2025-05-12T21:53:19.715Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.quantize.html</loc>
-    <lastmod>2025-05-12T21:53:19.869Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.base.html</loc>
-    <lastmod>2025-05-12T21:53:19.467Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html</loc>
-    <lastmod>2025-05-12T21:53:19.621Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html</loc>
-    <lastmod>2025-05-12T21:53:19.277Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.utils.html</loc>
-    <lastmod>2025-05-12T21:53:19.546Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/faq.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset_preprocessing.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/ray-integration.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/amd_hpc.html</loc>
-    <lastmod>2025-05-12T21:52:48.678Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/sequence_parallelism.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/fsdp_qlora.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/lr_groups.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/batch_vs_grad.html</loc>
-    <lastmod>2025-05-12T21:52:48.678Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/pretraining.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/template_free.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/index.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/tokenized.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/nccl.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/getting-started.html</loc>
-    <lastmod>2025-05-12T21:52:48.680Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/rlhf.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/mac.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/custom_integrations.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset-formats/conversation.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/inference.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/debugging.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multipack.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/docker.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/installation.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/multi-gpu.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/dataset_loading.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/config.html</loc>
-    <lastmod>2025-05-12T21:52:48.679Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/reward_modelling.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html</loc>
-    <lastmod>2025-05-12T21:53:19.713Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.geglu.html</loc>
-    <lastmod>2025-05-12T21:53:19.851Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.shared.html</loc>
-    <lastmod>2025-05-12T21:53:19.278Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.vllm_serve.html</loc>
-    <lastmod>2025-05-12T21:53:19.464Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.training_args.html</loc>
-    <lastmod>2025-05-12T21:53:19.251Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html</loc>
-    <lastmod>2025-05-12T21:53:19.718Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html</loc>
-    <lastmod>2025-05-12T21:53:19.974Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html</loc>
-    <lastmod>2025-05-12T21:53:19.966Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html</loc>
-    <lastmod>2025-05-12T21:53:20.043Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.chat_templates.html</loc>
-    <lastmod>2025-05-12T21:53:20.029Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html</loc>
-    <lastmod>2025-05-12T21:53:20.323Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html</loc>
-    <lastmod>2025-05-12T21:53:19.896Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.liger.args.html</loc>
-    <lastmod>2025-05-12T21:53:20.334Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html</loc>
-    <lastmod>2025-05-12T21:53:19.549Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.models.html</loc>
-    <lastmod>2025-05-12T21:53:20.012Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.datasets.chat.html</loc>
-    <lastmod>2025-05-12T21:53:19.283Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html</loc>
-    <lastmod>2025-05-12T21:53:19.654Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html</loc>
-    <lastmod>2025-05-12T21:53:19.692Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.dict.html</loc>
-    <lastmod>2025-05-12T21:53:20.119Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.config.html</loc>
-    <lastmod>2025-05-12T21:53:19.380Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.trl.html</loc>
-    <lastmod>2025-05-12T21:53:19.505Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.data.pretraining.html</loc>
-    <lastmod>2025-05-12T21:53:20.128Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html</loc>
-    <lastmod>2025-05-12T21:53:19.735Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schedulers.html</loc>
-    <lastmod>2025-05-12T21:53:20.096Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.freeze.html</loc>
-    <lastmod>2025-05-12T21:53:20.054Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/kernels.swiglu.html</loc>
-    <lastmod>2025-05-12T21:53:19.861Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.base.html</loc>
-    <lastmod>2025-05-12T21:53:20.319Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html</loc>
-    <lastmod>2025-05-12T21:53:19.473Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html</loc>
-    <lastmod>2025-05-12T21:53:19.532Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.sweeps.html</loc>
-    <lastmod>2025-05-12T21:53:19.428Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html</loc>
-    <lastmod>2025-05-12T21:53:19.665Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.gradient_checkpointing.unsloth.html</loc>
-    <lastmod>2025-05-12T21:53:20.132Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.args.html</loc>
-    <lastmod>2025-05-12T21:53:19.356Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.inference.html</loc>
-    <lastmod>2025-05-12T21:53:19.394Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html</loc>
-    <lastmod>2025-05-12T21:53:19.947Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html</loc>
-    <lastmod>2025-05-12T21:53:19.677Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html</loc>
-    <lastmod>2025-05-12T21:53:19.693Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.relora.html</loc>
-    <lastmod>2025-05-12T21:53:19.514Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/train.html</loc>
-    <lastmod>2025-05-12T21:53:19.069Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html</loc>
-    <lastmod>2025-05-12T21:53:19.680Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html</loc>
-    <lastmod>2025-05-12T21:53:19.984Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.trainer.html</loc>
-    <lastmod>2025-05-12T21:53:20.071Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html</loc>
-    <lastmod>2025-05-12T21:53:20.341Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html</loc>
-    <lastmod>2025-05-12T21:53:19.759Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html</loc>
-    <lastmod>2025-05-12T21:53:19.957Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html</loc>
-    <lastmod>2025-05-12T21:53:20.444Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mamba.html</loc>
-    <lastmod>2025-05-12T21:53:19.509Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/datasets.html</loc>
-    <lastmod>2025-05-12T21:53:19.086Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.enums.html</loc>
-    <lastmod>2025-05-12T21:53:20.207Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html</loc>
-    <lastmod>2025-05-12T21:53:20.362Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html</loc>
-    <lastmod>2025-05-12T21:53:19.922Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/cli.evaluate.html</loc>
-    <lastmod>2025-05-12T21:53:19.340Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.data.sft.html</loc>
-    <lastmod>2025-05-12T21:53:20.129Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html</loc>
-    <lastmod>2025-05-12T21:53:19.641Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html</loc>
-    <lastmod>2025-05-12T21:53:20.170Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/common.datasets.html</loc>
-    <lastmod>2025-05-12T21:53:20.361Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html</loc>
-    <lastmod>2025-05-12T21:53:19.544Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html</loc>
-    <lastmod>2025-05-12T21:53:19.586Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html</loc>
-    <lastmod>2025-05-12T21:53:20.331Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html</loc>
-    <lastmod>2025-05-12T21:53:19.606Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html</loc>
-    <lastmod>2025-05-12T21:53:20.338Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html</loc>
-    <lastmod>2025-05-12T21:53:19.912Z</lastmod>
-  </url>
-  <url>
-    <loc>https://docs.axolotl.ai/docs/api/core.chat.messages.html</loc>
-    <lastmod>2025-05-12T21:53:19.274Z</lastmod>
+    <loc>https://docs.axolotl.ai/docs/api/utils.gradient_checkpointing.offload_disk.html</loc>
+    <lastmod>2025-05-13T20:40:25.099Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.lora_embeddings.html</loc>
-    <lastmod>2025-05-12T21:53:20.038Z</lastmod>
+    <lastmod>2025-05-13T20:40:24.981Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.messages.html</loc>
+    <lastmod>2025-05-13T20:40:24.217Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html</loc>
+    <lastmod>2025-05-13T20:40:24.855Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html</loc>
+    <lastmod>2025-05-13T20:40:25.303Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html</loc>
+    <lastmod>2025-05-13T20:40:24.549Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html</loc>
+    <lastmod>2025-05-13T20:40:25.296Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.gradient_checkpointing.offload_cpu.html</loc>
+    <lastmod>2025-05-13T20:40:25.074Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html</loc>
+    <lastmod>2025-05-13T20:40:24.916Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html</loc>
+    <lastmod>2025-05-13T20:40:24.502Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/index.html</loc>
+    <lastmod>2025-05-13T20:40:23.954Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html</loc>
+    <lastmod>2025-05-13T20:40:24.466Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/evaluate.html</loc>
+    <lastmod>2025-05-13T20:40:24.025Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.utils.html</loc>
+    <lastmod>2025-05-13T20:40:25.179Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.base.html</loc>
+    <lastmod>2025-05-13T20:40:24.431Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.lora.html</loc>
+    <lastmod>2025-05-13T20:40:24.785Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/common.const.html</loc>
+    <lastmod>2025-05-13T20:40:25.309Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.training.html</loc>
+    <lastmod>2025-05-13T20:40:25.119Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html</loc>
+    <lastmod>2025-05-13T20:40:24.563Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.base.html</loc>
+    <lastmod>2025-05-13T20:40:24.531Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html</loc>
+    <lastmod>2025-05-13T20:40:24.907Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mamba.html</loc>
+    <lastmod>2025-05-13T20:40:25.351Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.train.html</loc>
+    <lastmod>2025-05-13T20:40:24.274Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html</loc>
+    <lastmod>2025-05-13T20:40:25.068Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.relora.html</loc>
+    <lastmod>2025-05-13T20:40:24.864Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.utils.html</loc>
+    <lastmod>2025-05-13T20:40:24.898Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html</loc>
+    <lastmod>2025-05-13T20:40:24.699Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.config.html</loc>
+    <lastmod>2025-05-13T20:40:25.107Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/convert.html</loc>
+    <lastmod>2025-05-13T20:40:24.046Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html</loc>
+    <lastmod>2025-05-13T20:40:25.396Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html</loc>
+    <lastmod>2025-05-13T20:40:25.415Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.lora.html</loc>
+    <lastmod>2025-05-13T20:40:24.977Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.peft.html</loc>
+    <lastmod>2025-05-13T20:40:25.145Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html</loc>
+    <lastmod>2025-05-13T20:40:25.153Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html</loc>
+    <lastmod>2025-05-13T20:40:24.609Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.sweeps.html</loc>
+    <lastmod>2025-05-13T20:40:24.371Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html</loc>
+    <lastmod>2025-05-13T20:40:24.475Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html</loc>
+    <lastmod>2025-05-13T20:40:24.417Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.base.html</loc>
+    <lastmod>2025-05-13T20:40:25.284Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.swiglu.html</loc>
+    <lastmod>2025-05-13T20:40:24.805Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.freeze.html</loc>
+    <lastmod>2025-05-13T20:40:24.997Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schedulers.html</loc>
+    <lastmod>2025-05-13T20:40:25.038Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html</loc>
+    <lastmod>2025-05-13T20:40:24.679Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.data.pretraining.html</loc>
+    <lastmod>2025-05-13T20:40:25.069Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.trl.html</loc>
+    <lastmod>2025-05-13T20:40:24.448Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.config.html</loc>
+    <lastmod>2025-05-13T20:40:24.323Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.dict.html</loc>
+    <lastmod>2025-05-13T20:40:25.060Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html</loc>
+    <lastmod>2025-05-13T20:40:24.635Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html</loc>
+    <lastmod>2025-05-13T20:40:24.597Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.datasets.chat.html</loc>
+    <lastmod>2025-05-13T20:40:24.227Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.models.html</loc>
+    <lastmod>2025-05-13T20:40:24.956Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html</loc>
+    <lastmod>2025-05-13T20:40:24.492Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.liger.args.html</loc>
+    <lastmod>2025-05-13T20:40:25.300Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html</loc>
+    <lastmod>2025-05-13T20:40:24.840Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html</loc>
+    <lastmod>2025-05-13T20:40:25.289Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.chat_templates.html</loc>
+    <lastmod>2025-05-13T20:40:24.972Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html</loc>
+    <lastmod>2025-05-13T20:40:24.986Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html</loc>
+    <lastmod>2025-05-13T20:40:24.910Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html</loc>
+    <lastmod>2025-05-13T20:40:24.918Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html</loc>
+    <lastmod>2025-05-13T20:40:24.661Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.training_args.html</loc>
+    <lastmod>2025-05-13T20:40:24.195Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.vllm_serve.html</loc>
+    <lastmod>2025-05-13T20:40:24.407Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.shared.html</loc>
+    <lastmod>2025-05-13T20:40:24.221Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.geglu.html</loc>
+    <lastmod>2025-05-13T20:40:24.795Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html</loc>
+    <lastmod>2025-05-13T20:40:24.657Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/reward_modelling.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/config.html</loc>
+    <lastmod>2025-05-13T20:39:49.789Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset_loading.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multi-gpu.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/installation.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/docker.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/multipack.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/debugging.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/inference.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/conversation.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/custom_integrations.html</loc>
+    <lastmod>2025-05-13T20:39:49.789Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/mac.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/rlhf.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html</loc>
+    <lastmod>2025-05-13T20:39:49.794Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/getting-started.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/nccl.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/tokenized.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/index.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/template_free.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset-formats/pretraining.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/batch_vs_grad.html</loc>
+    <lastmod>2025-05-13T20:39:49.789Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/lr_groups.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/fsdp_qlora.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/sequence_parallelism.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/amd_hpc.html</loc>
+    <lastmod>2025-05-13T20:39:49.789Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/ray-integration.html</loc>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/dataset_preprocessing.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/faq.html</loc>
+    <lastmod>2025-05-13T20:39:49.790Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.utils.html</loc>
+    <lastmod>2025-05-13T20:40:24.489Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html</loc>
+    <lastmod>2025-05-13T20:40:24.220Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html</loc>
+    <lastmod>2025-05-13T20:40:24.564Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.cloud.base.html</loc>
+    <lastmod>2025-05-13T20:40:24.410Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.quantize.html</loc>
+    <lastmod>2025-05-13T20:40:24.813Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html</loc>
+    <lastmod>2025-05-13T20:40:24.658Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mixins.sequence_parallel.html</loc>
+    <lastmod>2025-05-13T20:40:24.505Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html</loc>
+    <lastmod>2025-05-13T20:40:24.631Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.checks.html</loc>
+    <lastmod>2025-05-13T20:40:24.305Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.attention.mllama.html</loc>
+    <lastmod>2025-05-13T20:40:24.924Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.main.html</loc>
+    <lastmod>2025-05-13T20:40:24.265Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.utils.html</loc>
+    <lastmod>2025-05-13T20:40:24.403Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.tokenization.html</loc>
+    <lastmod>2025-05-13T20:40:24.963Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html</loc>
+    <lastmod>2025-05-13T20:40:24.900Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html</loc>
+    <lastmod>2025-05-13T20:40:24.603Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html</loc>
+    <lastmod>2025-05-13T20:40:24.677Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainer_builder.html</loc>
+    <lastmod>2025-05-13T20:40:24.108Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html</loc>
+    <lastmod>2025-05-13T20:40:24.218Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.merge_lora.html</loc>
+    <lastmod>2025-05-13T20:40:24.345Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html</loc>
+    <lastmod>2025-05-13T20:40:24.841Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.core.html</loc>
+    <lastmod>2025-05-13T20:40:25.328Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_tokenizers.html</loc>
+    <lastmod>2025-05-13T20:40:24.087Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.preprocess.html</loc>
+    <lastmod>2025-05-13T20:40:24.365Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html</loc>
+    <lastmod>2025-05-13T20:40:24.357Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html</loc>
+    <lastmod>2025-05-13T20:40:24.926Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html</loc>
+    <lastmod>2025-05-13T20:40:25.165Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html</loc>
+    <lastmod>2025-05-13T20:40:25.288Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/logging_config.html</loc>
+    <lastmod>2025-05-13T20:40:24.092Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html</loc>
+    <lastmod>2025-05-13T20:40:25.402Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html</loc>
+    <lastmod>2025-05-13T20:40:24.647Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html</loc>
+    <lastmod>2025-05-13T20:40:24.857Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html</loc>
+    <lastmod>2025-05-13T20:40:24.660Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.args.html</loc>
+    <lastmod>2025-05-13T20:40:24.299Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.inference.html</loc>
+    <lastmod>2025-05-13T20:40:24.337Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html</loc>
+    <lastmod>2025-05-13T20:40:24.890Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html</loc>
+    <lastmod>2025-05-13T20:40:24.620Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html</loc>
+    <lastmod>2025-05-13T20:40:24.637Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.relora.html</loc>
+    <lastmod>2025-05-13T20:40:24.457Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/train.html</loc>
+    <lastmod>2025-05-13T20:40:24.015Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html</loc>
+    <lastmod>2025-05-13T20:40:24.624Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html</loc>
+    <lastmod>2025-05-13T20:40:24.927Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.trainer.html</loc>
+    <lastmod>2025-05-13T20:40:25.014Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html</loc>
+    <lastmod>2025-05-13T20:40:25.306Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html</loc>
+    <lastmod>2025-05-13T20:40:24.703Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html</loc>
+    <lastmod>2025-05-13T20:40:24.901Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html</loc>
+    <lastmod>2025-05-13T20:40:25.407Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.mamba.html</loc>
+    <lastmod>2025-05-13T20:40:24.453Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/datasets.html</loc>
+    <lastmod>2025-05-13T20:40:24.032Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.enums.html</loc>
+    <lastmod>2025-05-13T20:40:25.174Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html</loc>
+    <lastmod>2025-05-13T20:40:25.327Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html</loc>
+    <lastmod>2025-05-13T20:40:24.865Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/cli.evaluate.html</loc>
+    <lastmod>2025-05-13T20:40:24.282Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.data.sft.html</loc>
+    <lastmod>2025-05-13T20:40:25.071Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html</loc>
+    <lastmod>2025-05-13T20:40:24.584Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html</loc>
+    <lastmod>2025-05-13T20:40:25.136Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/common.datasets.html</loc>
+    <lastmod>2025-05-13T20:40:25.326Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html</loc>
+    <lastmod>2025-05-13T20:40:24.488Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html</loc>
+    <lastmod>2025-05-13T20:40:24.530Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/kernels.utils.html</loc>
+    <lastmod>2025-05-13T20:40:24.814Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html</loc>
+    <lastmod>2025-05-13T20:40:24.669Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html</loc>
+    <lastmod>2025-05-13T20:40:24.576Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html</loc>
+    <lastmod>2025-05-13T20:40:25.355Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.bench.html</loc>
+    <lastmod>2025-05-13T20:40:24.990Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html</loc>
+    <lastmod>2025-05-13T20:40:25.411Z</lastmod>
+  </url>
+  <url>
+    <loc>https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html</loc>
+    <lastmod>2025-05-13T20:40:25.406Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/utils.schemas.trl.html</loc>
-    <lastmod>2025-05-12T21:53:20.182Z</lastmod>
+    <lastmod>2025-05-13T20:40:25.148Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/common.architectures.html</loc>
-    <lastmod>2025-05-12T21:53:20.342Z</lastmod>
+    <lastmod>2025-05-13T20:40:25.308Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html</loc>
-    <lastmod>2025-05-12T21:53:19.291Z</lastmod>
+    <lastmod>2025-05-13T20:40:24.234Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html</loc>
-    <lastmod>2025-05-12T21:53:19.670Z</lastmod>
+    <lastmod>2025-05-13T20:40:24.613Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/torchao.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/input_output.html</loc>
-    <lastmod>2025-05-12T21:52:48.682Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/cli.html</loc>
-    <lastmod>2025-05-12T21:52:48.678Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.789Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/docs/unsloth.html</loc>
-    <lastmod>2025-05-12T21:52:48.683Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.793Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html</loc>
-    <lastmod>2025-05-12T21:52:48.699Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.810Z</lastmod>
   </url>
   <url>
     <loc>https://docs.axolotl.ai/index.html</loc>
-    <lastmod>2025-05-12T21:52:48.695Z</lastmod>
+    <lastmod>2025-05-13T20:39:49.806Z</lastmod>
   </url>
 </urlset>