diff --git a/.nojekyll b/.nojekyll index f4b6a59e4..b6837083c 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -f4b50a99 \ No newline at end of file +c8f78714 \ No newline at end of file diff --git a/FAQS.html b/FAQS.html index eadebd0da..3aa04ba1c 100644 --- a/FAQS.html +++ b/FAQS.html @@ -343,6 +343,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -511,7 +516,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});

datasets

datasets

-

Module containing Dataset functionality

+

Module containing dataset functionality.

+

We want this to be a wrapper for an existing dataset that we have loaded. Let’s use the +concept of middlewares to wrap each dataset. We’ll use the collators later on to pad the +datasets.

Classes

@@ -523,72 +531,23 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - - -
ConstantLengthDatasetIterable dataset that returns constant length chunks of tokens from stream of
TokenizedPromptDataset Dataset that returns tokenized prompts from a stream of text files.
-
-

ConstantLengthDataset

-
datasets.ConstantLengthDataset(tokenizer, datasets, seq_length=2048)
-

Iterable dataset that returns constant length chunks of tokens from stream of -text files.

-
-

Parameters

- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeDescriptionDefault
tokenizerThe processor used for processing the data.required
datasetDataset with text files.required
seq_lengthLength of token sequences to return.2048
-
-

TokenizedPromptDataset

-
datasets.TokenizedPromptDataset(
-    prompt_tokenizer,
-    dataset,
-    process_count=None,
-    keep_in_memory=False,
-    **kwargs,
-)
+
datasets.TokenizedPromptDataset(
+    prompt_tokenizer,
+    dataset,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)

Dataset that returns tokenized prompts from a stream of text files.

-
-

Parameters

+
+

Parameters

diff --git a/docs/api/evaluate.html b/docs/api/evaluate.html index dd22b9949..cc662120b 100644 --- a/docs/api/evaluate.html +++ b/docs/api/evaluate.html @@ -378,6 +378,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + - + @@ -1020,8 +1026,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - + + diff --git a/docs/api/integrations.base.html b/docs/api/integrations.base.html index b040a4d67..c412e1f99 100644 --- a/docs/api/integrations.base.html +++ b/docs/api/integrations.base.html @@ -378,6 +378,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
datasetsModule containing Dataset functionalityModule containing dataset functionality.
convertCopied from https://github.com/iShohei220/adopt
utils.data.pretrainingdata handling specific to pretrainingutils.data.streamingData handling specific to streaming datasets.
utils.data.sft

prepare_datasets

-
utils.data.sft.prepare_datasets(
-    cfg,
-    tokenizer,
-    processor=None,
-    preprocess_iterable=False,
-)
+
utils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)

Prepare training and evaluation datasets based on configuration.

Parameters

----++++ @@ -572,12 +573,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - - - - -
Optional processor for multimodal datasets. None
preprocess_iterableboolWhether to use iterable preprocessing.False
diff --git a/docs/api/utils.data.pretraining.html b/docs/api/utils.data.streaming.html similarity index 98% rename from docs/api/utils.data.pretraining.html rename to docs/api/utils.data.streaming.html index 27de29126..159f22b7a 100644 --- a/docs/api/utils.data.pretraining.html +++ b/docs/api/utils.data.streaming.html @@ -7,7 +7,7 @@ -utils.data.pretraining – Axolotl +utils.data.streaming – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ +
+
+

Streaming Datasets

+
+ +
+
+ How to use streaming mode for large-scale datasets and memory-efficient training +
+
+ + +
+ + + + +
+ + + +
+ + +

Streaming enables memory-efficient training with large datasets by loading data +incrementally rather than loading the entire dataset into memory at once.

+

Use streaming when:

+
    +
  • Your dataset is too large to fit in memory (e.g. when you’re doing pretraining with massive text corpora)
  • +
  • You want to start training immediately without preprocessing the entire dataset
  • +
+

Streaming works with both remote and locally stored datasets!

+
+
+
+ +
+
+Note +
+
+
+

Streaming currently only supports a single dataset. Multi-dataset support will be added soon.

+
+
+
+

Configuration

+
+

Basic Streaming

+

Enable streaming mode by setting the streaming flag:

+
streaming: true
+
+
+

Pretraining with Streaming

+

For pretraining tasks, streaming is automatically enabled when using pretraining_dataset:

+
pretraining_dataset:
+  - path: HuggingFaceFW/fineweb-edu
+    type: pretrain
+    text_column: text
+    split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+
+
+

SFT with Streaming

+

For supervised fine-tuning with streaming:

+
streaming: true
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+    split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+
+
+
+

Configuration Options

+
+

streaming_multipack_buffer_size

+

Controls the buffer size for multipack streaming (default: 10,000). This determines how +many samples are buffered before packing. Larger buffers can improve packing efficiency +but use more memory.
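As a sketch (the values shown are illustrative defaults, not tuned recommendations), the buffer size can be set alongside sample packing:

```yaml
streaming: true
sample_packing: true
# Default is 10000; lower this to reduce memory,
# raise it to improve packing efficiency
streaming_multipack_buffer_size: 10000
```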

+
+
+

shuffle_merged_datasets

+

When enabled, shuffles the streaming dataset using the buffer. This requires additional +memory for the shuffle buffer.
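A minimal sketch combining shuffling with streaming, using only the flags documented in this section (the buffer size is illustrative):

```yaml
streaming: true
shuffle_merged_datasets: true
# The shuffle buffer consumes additional memory
# on top of the multipack buffer
streaming_multipack_buffer_size: 10000
```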

+
+
+
+

Sample Packing with Streaming

+

Sample packing is supported for streaming datasets. When enabled, multiple samples are +packed into a single sequence to maximize GPU utilization:

+
sample_packing: true
+streaming_multipack_buffer_size: 10000
+
+# For SFT: attention is automatically isolated between packed samples
+# For pretraining: control with pretrain_multipack_attn
+pretrain_multipack_attn: true  # prevent cross-attention between packed samples
+

For more information, see our documentation on multipacking.

+
+
+

Important Considerations

+
+

Memory Usage

+

While streaming reduces memory usage compared to loading entire datasets, you still need +to consider:

+
    +
  • The multipack buffer: memory usage can be controlled by adjusting streaming_multipack_buffer_size
  • +
  • Sample packing requires buffering multiple samples
  • +
  • Shuffling requires additional memory for the shuffle buffer
  • +
+
+
+

Performance

+
    +
  • Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly
  • +
  • Network speed and disk read speed matter when streaming from remote sources or local datasets, respectively
  • +
  • Consider using axolotl preprocess for smaller or more frequently used datasets
  • +
+
+
+

Evaluation Datasets

+

Evaluation datasets are not streamed to ensure consistent evaluation metrics. They’re +loaded normally even when training uses streaming.
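A hedged sketch of what this looks like in practice. The test_datasets key and the split names here are assumptions for illustration only (they do not appear in this section); the point is that the evaluation entry is loaded eagerly while training data streams:

```yaml
streaming: true
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train

# Evaluation data is loaded normally, not streamed
test_datasets:  # hypothetical key for this example
  - path: tatsu-lab/alpaca
    type: alpaca
    split: test  # illustrative split name
```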

+
+
+
+

Examples

+

See the examples/streaming/ directory for complete configuration examples:

+
    +
  • pretrain.yaml: Pretraining with streaming dataset
  • +
  • sft.yaml: Supervised fine-tuning with streaming
  • +
+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/torchao.html b/docs/torchao.html index 673e65d98..c6ab20338 100644 --- a/docs/torchao.html +++ b/docs/torchao.html @@ -379,6 +379,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + +